diff --git a/README.rst b/README.rst
index 233f347b..90c84b6e 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-PyWb 0.5.1
+PyWb 0.5.2
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py
index e066d4d1..5bbb65b8 100644
--- a/pywb/framework/test/test_wbrequestresponse.py
+++ b/pywb/framework/test/test_wbrequestresponse.py
@@ -40,13 +40,13 @@
# WbResponse Tests
# =================
>>> WbResponse.text_response('Test')
-{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
+{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain'), ('Content-Length', '4')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
-{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
+{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])}
"""
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 0f1a9f32..da456474 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -125,7 +125,7 @@ class WbRequest(object):
if not self.wb_url:
return
- mime = self.env.get('CONTENT_TYPE')
+        mime = self.env.get('CONTENT_TYPE', '').split(';')[0]  # may be unset
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
@@ -152,23 +152,31 @@ class WbResponse(object):
pass
@staticmethod
- def text_stream(stream, status='200 OK', content_type='text/plain'):
- status_headers = StatusAndHeaders(status,
- [('Content-Type', content_type)])
+    def text_stream(stream, status='200 OK', content_type='text/plain',
+                    headers=None):
+        # Content-Type always comes first; any extra headers follow it
+        all_headers = [('Content-Type', content_type)]
+        all_headers.extend(headers or [])
+
+        status_headers = StatusAndHeaders(status, all_headers)
return WbResponse(status_headers, value=stream)
@staticmethod
def text_response(text, status='200 OK', content_type='text/plain'):
status_headers = StatusAndHeaders(status,
- [('Content-Type', content_type)])
+ [('Content-Type', content_type),
+ ('Content-Length', str(len(text)))])
return WbResponse(status_headers, value=[text])
@staticmethod
- def redir_response(location, status='302 Redirect'):
- return WbResponse(StatusAndHeaders(status,
- [('Location', location)]))
+    def redir_response(location, status='302 Redirect', headers=None):
+        # no body is sent, so Content-Length: 0 accompanies the Location
+        all_headers = [('Location', location), ('Content-Length', '0')]
+        all_headers.extend(headers or [])
+
+        return WbResponse(StatusAndHeaders(status, all_headers))
def __call__(self, env, start_response):
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index ec93593a..93ec396b 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -1,6 +1,8 @@
#import chardet
import pkgutil
import yaml
+import re
+
from chardet.universaldetector import UniversalDetector
from io import BytesIO
@@ -52,11 +54,12 @@ class RewriteContent:
return (rewritten_headers, stream)
- def rewrite_content(self, urlrewriter, headers, stream,
+ def rewrite_content(self, wb_url, urlrewriter, headers, stream,
head_insert_func=None, urlkey='',
- sanitize_only=False, cdx=None, mod=None):
+ cdx=None):
- if sanitize_only:
+ if (wb_url.is_identity or
+ (not head_insert_func and wb_url.is_banner_only)):
status_headers, stream = self.sanitize_content(headers, stream)
return (status_headers, self.stream_to_gen(stream), False)
@@ -78,6 +81,8 @@ class RewriteContent:
# see known js/css modifier specified, the context should run
# default text_type
+ mod = wb_url.mod
+
if mod == 'js_':
text_type = 'js'
elif mod == 'cs_':
@@ -118,6 +123,10 @@ class RewriteContent:
if head_insert_func:
head_insert_str = head_insert_func(rule, cdx)
+ if wb_url.is_banner_only:
+ gen = self._head_insert_only_gen(head_insert_str, stream)
+ return (status_headers, gen, False)
+
rewriter = rewriter_class(urlrewriter,
js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'],
@@ -125,7 +134,10 @@ class RewriteContent:
defmod=self.defmod)
else:
- # apply one of (js, css, xml) rewriters
+ if wb_url.is_banner_only:
+ return (status_headers, self.stream_to_gen(stream), False)
+
+ # apply one of (js, css, xml) rewriters
rewriter = rewriter_class(urlrewriter)
# Create rewriting generator
@@ -134,6 +146,32 @@ class RewriteContent:
return (status_headers, gen, True)
+    HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
+
+    def _head_insert_only_gen(self, insert_str, stream):
+        """Insert insert_str after <head>, or up front if none is found."""
+        max_len = 1024
+        buff = ''
+        while max_len > 0:
+            curr = stream.read(max_len)
+            if not curr:
+                break
+            # charge only the newly read bytes against the 1024-byte budget
+            max_len -= len(curr)
+            buff += curr
+
+        matcher = self.HEAD_REGEX.search(buff)
+        if matcher:
+            yield buff[:matcher.end()] + insert_str
+            yield buff[matcher.end():]
+        else:
+            yield insert_str
+            yield buff
+
+        for buff in self.stream_to_gen(stream):
+            yield buff
+
+
# Create rewrite stream, may even be chunked by front-end
def _rewriting_stream_gen(self, rewriter, encoding,
stream, first_buff=None):
diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py
index b81b0144..97024600 100644
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@@ -14,8 +14,9 @@ from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
-from pywb.rewrite.url_rewriter import UrlRewriter
-from pywb.rewrite.rewrite_content import RewriteContent
+from url_rewriter import UrlRewriter
+from wburl import WbUrl
+from rewrite_content import RewriteContent
#=================================================================
@@ -114,15 +115,20 @@ class LiveRewriter(object):
return (status_headers, stream)
- def fetch_request(self, url, urlrewriter,
+ def fetch_request(self, wb_url, urlrewriter,
head_insert_func=None,
urlkey=None,
env=None,
req_headers={},
timestamp=None,
follow_redirects=False,
- proxies=None,
- mod=None):
+ proxies=None):
+
+ if isinstance(wb_url, str):
+ url = wb_url
+ wb_url = WbUrl(url)
+ else:
+ url = wb_url.url
ts_err = url.split('///')
@@ -155,13 +161,13 @@ class LiveRewriter(object):
}
result = (self.rewriter.
- rewrite_content(urlrewriter,
+ rewrite_content(wb_url,
+ urlrewriter,
status_headers,
stream,
head_insert_func=head_insert_func,
urlkey=urlkey,
- cdx=cdx,
- mod=mod))
+ cdx=cdx))
return result
@@ -174,41 +180,3 @@ class LiveRewriter(object):
buff = ''.join(gen)
return (status_headers, buff)
-
-
-#=================================================================
-def main(): # pragma: no cover
- import sys
-
- if len(sys.argv) < 2:
- msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'
- print msg.format(sys.argv[0])
- return 1
- else:
- url = sys.argv[1]
-
- if len(sys.argv) >= 3:
- wburl_str = sys.argv[2]
- if wburl_str.startswith('/'):
- wburl_str = wburl_str[1:]
-
- prefix, wburl_str = wburl_str.split('/', 1)
- prefix = '/' + prefix + '/'
- else:
- wburl_str = (datetime_to_timestamp(datetime.datetime.now()) +
- '/http://example.com/path/sample.html')
- prefix = '/pywb_rewrite/'
-
- urlrewriter = UrlRewriter(wburl_str, prefix)
-
- liverewriter = LiveRewriter()
-
- status_headers, buff = liverewriter.get_rewritten(url, urlrewriter)
-
- sys.stdout.write(buff)
- return 0
-
-
-#=================================================================
-if __name__ == "__main__":
- exit(main())
diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py
index 3cd9ad72..f826108f 100644
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@@ -196,8 +196,11 @@ class WbUrl(BaseWbUrl):
@property
def is_embed(self):
return (self.mod and
- self.mod != 'id_' and
- self.mod != 'mp_')
+ self.mod not in ('id_', 'mp_', 'bn_'))
+
+    @property
+    def is_banner_only(self):
+        return (self.mod == 'bn_')  # true only for the 'bn_' modifier
@property
def is_identity(self):
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
index be810823..b1ff4a26 100644
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@@ -1,5 +1,5 @@
-{% if rule.js_rewrite_location %}
+{% if rule.js_rewrite_location and include_wombat %}
diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py
index 6228de3e..ce30793d 100644
--- a/pywb/webapp/handlers.py
+++ b/pywb/webapp/handlers.py
@@ -115,6 +115,14 @@ class StaticHandler(BaseHandler):
try:
data = self.block_loader.load(full_path)
+            try:
+                # jump to the end to learn the size, then rewind
+                data.seek(0, 2)
+                headers = [('Content-Length', str(data.tell()))]
+                data.seek(0)
+            except IOError:
+                headers = None
+
if 'wsgi.file_wrapper' in wbrequest.env:
reader = wbrequest.env['wsgi.file_wrapper'](data)
else:
@@ -122,7 +130,9 @@ class StaticHandler(BaseHandler):
content_type, _ = mimetypes.guess_type(full_path)
- return WbResponse.text_stream(data, content_type=content_type)
+ return WbResponse.text_stream(data,
+ content_type=content_type,
+ headers=headers)
except IOError:
raise NotFoundException('Static File Not Found: ' +
diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py
index c4e0f4f3..2542aee2 100644
--- a/pywb/webapp/replay_views.py
+++ b/pywb/webapp/replay_views.py
@@ -99,8 +99,8 @@ class RewriteLiveView(BaseContentView):
if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
- url = wbrequest.wb_url.url
- result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
+ wb_url = wbrequest.wb_url
+ result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter,
head_insert_func=head_insert_func,
env=wbrequest.env)
@@ -211,14 +211,13 @@ class ReplayView(BaseContentView):
create_insert_func(wbrequest))
result = (self.content_rewriter.
- rewrite_content(urlrewriter,
+ rewrite_content(wbrequest.wb_url,
+ urlrewriter,
headers=status_headers,
stream=stream,
head_insert_func=head_insert_func,
urlkey=cdx['urlkey'],
- sanitize_only=wbrequest.wb_url.is_identity,
- cdx=cdx,
- mod=wbrequest.wb_url.mod))
+ cdx=cdx))
(status_headers, response_iter, is_rewritten) = result
diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py
index c49be8c9..0fc5589d 100644
--- a/pywb/webapp/views.py
+++ b/pywb/webapp/views.py
@@ -121,16 +121,18 @@ def add_env_globals(glb):
#=================================================================
class HeadInsertView(J2TemplateView):
- def create_insert_func(self, wbrequest, include_ts=True):
+ def create_insert_func(self, wbrequest,
+ include_ts=True):
canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='')
- include_ts = include_ts
+ include_wombat = not wbrequest.wb_url.is_banner_only
def make_head_insert(rule, cdx):
return (self.render_to_string(wbrequest=wbrequest,
cdx=cdx,
canon_url=canon_url,
include_ts=include_ts,
+ include_wombat=include_wombat,
rule=rule))
return make_head_insert
diff --git a/setup.py b/setup.py
index 3e89abed..a6e9c885 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
- version='0.5.1',
+ version='0.5.2',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 94ce45cf..456d50f8 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -98,6 +98,7 @@ class TestWb:
assert 'Mon, Jan 27 2014 17:12:38' in resp.body
assert 'wb.js' in resp.body
+ assert 'WB_wombat_init' in resp.body
assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body
def test_replay_non_frame_content(self):
@@ -141,6 +142,19 @@ class TestWb:
assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')
+    def test_replay_banner_only(self):
+        resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved')
+
+        # the head insert is still applied: wb.js must be referenced
+        assert 'wb.js' in resp.body
+
+        # wombat must not be initialised for a bn_ request
+        assert 'WB_wombat_init' not in resp.body
+
+        # urls are not rewritten: the relative css link appears verbatim
+        #assert '"http://www.iana.org/domains/example"' in resp.body
+        assert '"/_css/2013.1/screen.css"' in resp.body
+
def test_replay_identity_1(self):
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')