diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index da456474..a6f1908b 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -179,13 +179,6 @@ class WbResponse(object):
return WbResponse(StatusAndHeaders(status, redir_headers))
def __call__(self, env, start_response):
-
- # PERF
- perfstats = env.get('X_PERF')
- if perfstats:
- self.status_headers.headers.append(('X-Archive-Perf-Stats',
- str(perfstats)))
-
start_response(self.status_headers.statusline,
self.status_headers.headers)
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 08b1e997..5a316016 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -101,12 +101,9 @@ class HTMLRewriterMixin(object):
if not m:
return meta_refresh
- try:
- meta_refresh = (meta_refresh[:m.start(1)] +
- self._rewrite_url(m.group(1)) +
- meta_refresh[m.end(1):])
- except Exception:
- pass
+ meta_refresh = (meta_refresh[:m.start(1)] +
+ self._rewrite_url(m.group(1)) +
+ meta_refresh[m.end(1):])
return meta_refresh
# ===========================
@@ -136,7 +133,7 @@ class HTMLRewriterMixin(object):
return value.lower() == attr_value.lower()
return False
- def _rewrite_tag_attrs(self, tag, tag_attrs, escape=False):
+ def _rewrite_tag_attrs(self, tag, tag_attrs):
# special case: script or style parse context
if ((tag in self.STATE_TAGS) and not self._wb_parse_context):
self._wb_parse_context = tag
@@ -197,7 +194,7 @@ class HTMLRewriterMixin(object):
rebase_rewriter(attr_value))
# write the attr!
- self._write_attr(attr_name, attr_value, escape=escape)
+ self._write_attr(attr_name, attr_value)
return True
@@ -217,12 +214,10 @@ class HTMLRewriterMixin(object):
return True
- def _write_attr(self, name, value, escape=False):
+ def _write_attr(self, name, value):
# parser doesn't differentiate between 'attr=""' and just 'attr'
# 'attr=""' is more common, so use that form
if value:
- if escape:
- value = cgi.escape(value, quote=True)
self.out.write(' ' + name + '="' + value + '"')
else:
self.out.write(' ' + name + '=""')
@@ -259,8 +254,8 @@ class HTMLRewriterMixin(object):
return result
- def _internal_close(self):
- pass
+ def _internal_close(self): # pragma: no cover
+ raise NotImplementedError('Base method')
#=================================================================
@@ -272,7 +267,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def feed(self, string):
try:
HTMLParser.feed(self, string)
- except HTMLParseError:
+ except HTMLParseError: # pragma: no cover
+ # HTMLParseError is only raised under Python 2.6
self.out.write(string)
def _internal_close(self):
@@ -283,7 +279,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
try:
HTMLParser.close(self)
- except HTMLParseError:
+ except HTMLParseError: # pragma: no cover
+ # HTMLParseError is only raised under Python 2.6
pass
# called to unescape attrs -- do not unescape!
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 021f76b6..2225bbaf 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -58,10 +58,12 @@ class RewriteContent:
return (rewritten_headers, stream)
- def rewrite_content(self, wb_url, urlrewriter, headers, stream,
+ def rewrite_content(self, urlrewriter, headers, stream,
head_insert_func=None, urlkey='',
cdx=None):
+ wb_url = urlrewriter.wburl
+
if (wb_url.is_identity or
(not head_insert_func and wb_url.is_banner_only)):
status_headers, stream = self.sanitize_content(headers, stream)
@@ -109,16 +111,6 @@ class RewriteContent:
else:
stream = DecompressingBufferedReader(stream)
- #if self.decode_stream:
- # if rewritten_headers.charset:
- # encoding = rewritten_headers.charset
- # else:
- # (encoding, first_buff) = self._detect_charset(stream)
-
- # if encoding not set or chardet thinks its ascii, use utf-8
- # if not encoding or encoding == 'ascii':
- # encoding = 'utf-8'
-
rule = self.ruleset.get_first_match(urlkey)
rewriter_class = rule.rewriters[text_type]
@@ -149,8 +141,11 @@ class RewriteContent:
rewriter = rewriter_class(urlrewriter)
# Create rewriting generator
- gen = self._rewriting_stream_gen(rewriter, encoding,
- stream, first_buff)
+ gen = self.stream_to_gen(stream,
+ rewrite_func=rewriter.rewrite,
+ final_read_func=rewriter.close,
+ first_buff=first_buff)
+
return (status_headers, gen, True)
@@ -179,32 +174,6 @@ class RewriteContent:
for buff in self.stream_to_gen(stream):
yield buff
-
- # Create rewrite stream, may even be chunked by front-end
- def _rewriting_stream_gen(self, rewriter, encoding,
- stream, first_buff=None):
-
- def do_rewrite(buff):
- if encoding:
- buff = self._decode_buff(buff, stream, encoding)
- buff = rewriter.rewrite(buff)
- if encoding:
- buff = buff.encode(encoding)
-
- return buff
-
- def do_finish():
- result = rewriter.close()
- if encoding:
- result = result.encode(encoding)
-
- return result
-
- return self.stream_to_gen(stream,
- rewrite_func=do_rewrite,
- final_read_func=do_finish,
- first_buff=first_buff)
-
@staticmethod
def _decode_buff(buff, stream, encoding): # pragma: no coverage
try:
@@ -223,26 +192,6 @@ class RewriteContent:
return buff
- def _detect_charset(self, stream): # pragma: no coverage
- full_buff = stream.read(8192)
- io_buff = BytesIO(full_buff)
-
- detector = UniversalDetector()
-
- try:
- buff = io_buff.read(256)
- while buff:
- detector.feed(buff)
- if detector.done:
- break
-
- buff = io_buff.read(256)
- finally:
- detector.close()
-
- print "chardet result: ", str(detector.result)
- return (detector.result['encoding'], full_buff)
-
# Create a generator reading from a stream,
# with optional rewriting and final read call
@staticmethod
diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py
index cbd3f106..be891498 100644
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@@ -9,7 +9,7 @@ import logging
from urlparse import urlsplit
-from pywb.utils.loaders import is_http, LimitReader
+from pywb.utils.loaders import is_http, LimitReader, BlockLoader
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
@@ -30,7 +30,8 @@ class LiveRewriter(object):
logging.debug('Live Rewrite Direct (no proxy)')
def fetch_local_file(self, uri):
- fh = open(uri)
+ #fh = open(uri)
+ fh = BlockLoader().load_file_or_resource(uri)
content_type, _ = mimetypes.guess_type(uri)
@@ -118,7 +119,7 @@ class LiveRewriter(object):
return (status_headers, stream)
- def fetch_request(self, wb_url, urlrewriter,
+ def fetch_request(self, url, urlrewriter,
head_insert_func=None,
urlkey=None,
env=None,
@@ -127,15 +128,11 @@ class LiveRewriter(object):
follow_redirects=False,
proxies=None):
- if isinstance(wb_url, str):
- url = wb_url
- wb_url = WbUrl(url)
- else:
- url = wb_url.url
-
ts_err = url.split('///')
- if len(ts_err) > 1:
+ # fix up an accidentally mis-rewritten url that contains '///'
+ # (but leave genuine file:/// urls untouched)
+ if len(ts_err) > 1 and ts_err[0] != 'file:':
url = 'http://' + ts_err[1]
if url.startswith('//'):
@@ -164,8 +161,7 @@ class LiveRewriter(object):
}
result = (self.rewriter.
- rewrite_content(wb_url,
- urlrewriter,
+ rewrite_content(urlrewriter,
status_headers,
stream,
head_insert_func=head_insert_func,
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index ae9b24e2..9ea8edc0 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -99,6 +99,7 @@ ur"""
>>> parse('
SomeTest
', head_insert = '')
SomeTest
+# doctype
>>> parse('')
diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py
index 24f76da1..fcb51ea3 100644
--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@@ -1,5 +1,6 @@
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.wburl import WbUrl
from pywb import get_test_dir
@@ -9,6 +10,7 @@ from io import BytesIO
# As such, the content may change and the test may break
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
+bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule, cdx):
if rule.js_rewrite_location == True:
@@ -33,6 +35,51 @@ def test_local_1():
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+def test_local_no_head():
+ status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html',
+ urlrewriter,
+ head_insert_func,
+ 'com,example,test)/')
+
+ # wombat insert added
+ assert '' in buff
+
+ # location rewritten
+ assert 'window.WB_wombat_location = "/other.html"' in buff
+
+ # link rewritten
+ assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
+def test_local_no_head_banner_only():
+ status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html',
+ bn_urlrewriter,
+ head_insert_func,
+ 'com,example,test)/')
+
+ # wombat insert added
+ assert '' in buff
+
+ # location NOT rewritten
+ assert 'window.location = "/other.html"' in buff
+
+ # link NOT rewritten
+ assert '"another.html"' in buff
+
+def test_local_banner_only():
+ status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
+ bn_urlrewriter,
+ head_insert_func,
+ 'com,example,test)/')
+
+ # wombat insert added
+ assert '' in buff
+
+ # location NOT rewritten
+ assert 'window.location = "/other.html"' in buff
+
+ # link NOT rewritten
+ assert '"another.html"' in buff
+
def test_local_2_no_js_location_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
@@ -76,8 +123,7 @@ def test_example_4_rewrite_err():
assert status_headers.get_statuscode() == '200'
def test_example_domain_specific_3():
- urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
- status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True)
+ status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True)
# comment out bootloader
assert '/* Bootloader.configurePage' in buff
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
index a4173d3a..73340c95 100644
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@@ -65,6 +65,9 @@
>>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'mailto:example@example.com'
+>>> do_rewrite('file:///some/path/', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
+'file:///some/path/'
+
>>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
'/abc/19960708im_/'
@@ -73,10 +76,10 @@
# HttpsUrlRewriter tests
->>> HttpsUrlRewriter(None, None).rewrite('https://example.com/abc')
+>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
'http://example.com/abc'
->>> HttpsUrlRewriter(None, None).rewrite('http://example.com/abc')
+>>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc')
'http://example.com/abc'
"""
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index 2679b4dc..c89e9a21 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -13,7 +13,8 @@ class UrlRewriter(object):
instance and an optional full path prefix
"""
- NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
+ NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:',
+ 'mailto:', 'about:', 'file:']
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
@@ -125,7 +126,7 @@ class UrlRewriter(object):
#=================================================================
-class HttpsUrlRewriter(object):
+class HttpsUrlRewriter(UrlRewriter):
"""
A url rewriter which urls that start with https:// to http://
Other urls/input is unchanged.
@@ -134,9 +135,6 @@ class HttpsUrlRewriter(object):
HTTP = 'http://'
HTTPS = 'https://'
- def __init__(self, wburl, prefix, full_prefix=None):
- pass
-
def rewrite(self, url, mod=None):
if url.startswith(self.HTTPS):
result = self.HTTP + url[len(self.HTTPS):]
diff --git a/pywb/ui/query.html b/pywb/ui/query.html
index c78e1b49..2d1f5c86 100644
--- a/pywb/ui/query.html
+++ b/pywb/ui/query.html
@@ -1,3 +1,28 @@
+
+
+
+
pywb Sample Calendar Results
{{ cdx_lines | length }} captures of {{ url }}
@@ -10,7 +35,9 @@
{% for cdx in cdx_lines %}
- {{ cdx['timestamp'] | format_ts}} |
+
+
+ |
{{ cdx['statuscode'] }} |
{{ cdx['original'] }} |
{{ cdx['filename'] }} |
@@ -21,3 +48,4 @@
* Unique captures are bold. Other captures are duplicates of a previous capture.
+
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 6b383493..107379a2 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -96,7 +96,7 @@ class BlockLoader(object):
else:
return self.load_file_or_resource(url, offset, length)
- def load_file_or_resource(self, url, offset, length):
+ def load_file_or_resource(self, url, offset=0, length=-1):
"""
Load a file-like reader from the local file system
"""
diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py
index b64f2419..322b9169 100644
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@@ -1,5 +1,5 @@
#=================================================================
-"""
+r"""
# LimitReader Tests
>>> LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
'abcdefghji'
@@ -32,10 +32,14 @@ True
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
-# fixed cookie
+# fixed cookie, range request
>>> BlockLoader('some=value').load('http://example.com', 41, 14).read()
'Example Domain'
+# range request with an offset only (no length given)
+>>> BlockLoader().load('http://example.com', 1262).read()
+'\n'
+
# test with extra id, ensure 4 parts of the A-B=C-D form are present
>>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra')))
4
diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py
index a1b602d4..cb279beb 100644
--- a/pywb/webapp/live_rewrite_handler.py
+++ b/pywb/webapp/live_rewrite_handler.py
@@ -38,6 +38,10 @@ class RewriteHandler(SearchPageWbUrlHandler):
return self.render_content(wbrequest)
except Exception as exc:
+ import traceback
+ err_details = traceback.format_exc(exc)
+ print err_details
+
url = wbrequest.wb_url.url
msg = 'Could not load the url from the live web: ' + url
raise LiveResourceException(msg=msg, url=url)
@@ -53,8 +57,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
- wb_url = wbrequest.wb_url
- result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter,
+ result = self.rewriter.fetch_request(wbrequest.wb_url.url,
+ wbrequest.urlrewriter,
head_insert_func=head_insert_func,
req_headers=req_headers,
env=wbrequest.env)
diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py
index 5002a18d..9f32ad5d 100644
--- a/pywb/webapp/replay_views.py
+++ b/pywb/webapp/replay_views.py
@@ -130,8 +130,7 @@ class ReplayView(object):
create_insert_func(wbrequest))
result = (self.content_rewriter.
- rewrite_content(wbrequest.wb_url,
- urlrewriter,
+ rewrite_content(urlrewriter,
headers=status_headers,
stream=stream,
head_insert_func=head_insert_func,
diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html
new file mode 100644
index 00000000..ed4bc4f3
--- /dev/null
+++ b/sample_archive/text_content/sample_no_head.html
@@ -0,0 +1,8 @@
+
+Test Content
+Some Link
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 5723425e..67bf698b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -301,6 +301,11 @@ class TestWb:
assert resp.status_int == 200
assert '"data": "^"' in resp.body
+ def test_post_invalid(self):
+ # not json
+ resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
+ assert resp.status_int == 404
+
def test_post_redirect(self):
# post handled without redirect (since 307 not allowed)
resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014mp_/http://httpbin.org/post')])
@@ -308,7 +313,6 @@ class TestWb:
assert '"foo": "bar"' in resp.body
assert '"test": "abc"' in resp.body
-
def test_excluded_content(self):
resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403
diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py
index 5ce19414..331eaa69 100644
--- a/tests/test_live_rewriter.py
+++ b/tests/test_live_rewriter.py
@@ -17,6 +17,13 @@ class TestLiveRewriter:
resp = self.testapp.get('/rewrite/mp_/http://facebook.com/')
assert resp.status_int == 301
+ def test_live_rewrite_post(self):
+ resp = self.testapp.post('/rewrite/mp_/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
+ assert resp.status_int == 200
+ assert '"foo": "bar"' in resp.body
+ assert '"test": "abc"' in resp.body
+ assert resp.status_int == 200
+
def test_live_rewrite_frame(self):
resp = self.testapp.get('/rewrite/http://example.com/')
assert resp.status_int == 200