diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 3a635d4e..3cbcd362 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -105,16 +105,6 @@ class RewriteContent:
else:
stream = DecompressingBufferedReader(stream)
- #if self.decode_stream:
- # if rewritten_headers.charset:
- # encoding = rewritten_headers.charset
- # else:
- # (encoding, first_buff) = self._detect_charset(stream)
-
- # if encoding not set or chardet thinks its ascii, use utf-8
- # if not encoding or encoding == 'ascii':
- # encoding = 'utf-8'
-
rule = self.ruleset.get_first_match(urlkey)
rewriter_class = rule.rewriters[text_type]
@@ -145,8 +135,11 @@ class RewriteContent:
rewriter = rewriter_class(urlrewriter)
# Create rewriting generator
- gen = self._rewriting_stream_gen(rewriter, encoding,
- stream, first_buff)
+ gen = self.stream_to_gen(stream,
+ rewrite_func=rewriter.rewrite,
+ final_read_func=rewriter.close,
+ first_buff=first_buff)
+
return (status_headers, gen, True)
@@ -175,32 +168,6 @@ class RewriteContent:
for buff in self.stream_to_gen(stream):
yield buff
-
- # Create rewrite stream, may even be chunked by front-end
- def _rewriting_stream_gen(self, rewriter, encoding,
- stream, first_buff=None):
-
- def do_rewrite(buff):
- if encoding:
- buff = self._decode_buff(buff, stream, encoding)
- buff = rewriter.rewrite(buff)
- if encoding:
- buff = buff.encode(encoding)
-
- return buff
-
- def do_finish():
- result = rewriter.close()
- if encoding:
- result = result.encode(encoding)
-
- return result
-
- return self.stream_to_gen(stream,
- rewrite_func=do_rewrite,
- final_read_func=do_finish,
- first_buff=first_buff)
-
@staticmethod
def _decode_buff(buff, stream, encoding): # pragma: no coverage
try:
@@ -219,26 +186,6 @@ class RewriteContent:
return buff
- def _detect_charset(self, stream): # pragma: no coverage
- full_buff = stream.read(8192)
- io_buff = BytesIO(full_buff)
-
- detector = UniversalDetector()
-
- try:
- buff = io_buff.read(256)
- while buff:
- detector.feed(buff)
- if detector.done:
- break
-
- buff = io_buff.read(256)
- finally:
- detector.close()
-
- print "chardet result: ", str(detector.result)
- return (detector.result['encoding'], full_buff)
-
# Create a generator reading from a stream,
# with optional rewriting and final read call
@staticmethod
diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py
index cbd3f106..5d77ff52 100644
--- a/pywb/rewrite/rewrite_live.py
+++ b/pywb/rewrite/rewrite_live.py
@@ -9,7 +9,7 @@ import logging
from urlparse import urlsplit
-from pywb.utils.loaders import is_http, LimitReader
+from pywb.utils.loaders import is_http, LimitReader, BlockLoader
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
@@ -30,7 +30,8 @@ class LiveRewriter(object):
logging.debug('Live Rewrite Direct (no proxy)')
def fetch_local_file(self, uri):
- fh = open(uri)
+ #fh = open(uri)
+ fh = BlockLoader().load_file_or_resource(uri)
content_type, _ = mimetypes.guess_type(uri)
@@ -135,12 +136,14 @@ class LiveRewriter(object):
ts_err = url.split('///')
- if len(ts_err) > 1:
+ if len(ts_err) > 1 and ts_err[0] != 'file:':
url = 'http://' + ts_err[1]
if url.startswith('//'):
url = 'http:' + url
+ print 'URL ', url
+
if is_http(url):
(status_headers, stream) = self.fetch_http(url, env, req_headers,
follow_redirects,
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index ae9b24e2..9ea8edc0 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -99,6 +99,7 @@ ur"""
>>> parse('
SomeTest
', head_insert = '')
SomeTest
+# doctype
>>> parse('')
diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py
index 24f76da1..af25762b 100644
--- a/pywb/rewrite/test/test_rewrite_live.py
+++ b/pywb/rewrite/test/test_rewrite_live.py
@@ -1,5 +1,6 @@
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
+from pywb.rewrite.wburl import WbUrl
from pywb import get_test_dir
@@ -33,6 +34,58 @@ def test_local_1():
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+def test_local_no_head():
+ wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html')
+ status_headers, buff = get_rewritten(wb_url,
+ urlrewriter,
+ head_insert_func,
+ 'com,example,test)/')
+
+ # wombat insert added
+ assert '' in buff
+
+ # location rewritten
+ assert 'window.WB_wombat_location = "/other.html"' in buff
+
+ # link rewritten
+ assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
+
+def test_local_no_head_banner_only():
+ wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html')
+ wb_url.mod = 'bn_'
+
+ status_headers, buff = get_rewritten(wb_url,
+ urlrewriter,
+ head_insert_func,
+ 'com,example,test)/')
+
+ # wombat insert added
+ assert '' in buff
+
+ # location NOT rewritten
+ assert 'window.location = "/other.html"' in buff
+
+ # link NOT rewritten
+ assert '"another.html"' in buff
+
+def test_local_banner_only():
+ wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample.html')
+ wb_url.mod = 'bn_'
+
+ status_headers, buff = get_rewritten(wb_url,
+ urlrewriter,
+ head_insert_func,
+ 'com,example,test)/')
+
+ # wombat insert added
+ assert '' in buff
+
+ # location NOT rewritten
+ assert 'window.location = "/other.html"' in buff
+
+ # link NOT rewritten
+ assert '"another.html"' in buff
+
def test_local_2_no_js_location_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
index a4173d3a..345c4faf 100644
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@@ -65,6 +65,9 @@
>>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'mailto:example@example.com'
+>>> do_rewrite('file:///some/path/', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
+'file:///some/path/'
+
>>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
'/abc/19960708im_/'
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index d5593a22..d9b42c1b 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -13,7 +13,8 @@ class UrlRewriter(object):
instance and an optional full path prefix
"""
- NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
+ NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:',
+ 'mailto:', 'about:', 'file:']
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
diff --git a/pywb/ui/query.html b/pywb/ui/query.html
index c78e1b49..2d1f5c86 100644
--- a/pywb/ui/query.html
+++ b/pywb/ui/query.html
@@ -1,3 +1,28 @@
+
+
+
+
pywb Sample Calendar Results
{{ cdx_lines | length }} captures of {{ url }}
@@ -10,7 +35,9 @@
{% for cdx in cdx_lines %}
- {{ cdx['timestamp'] | format_ts}} |
+
+
+ |
{{ cdx['statuscode'] }} |
{{ cdx['original'] }} |
{{ cdx['filename'] }} |
@@ -21,3 +48,4 @@
* Unique captures are bold. Other captures are duplicates of a previous capture.
+
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 6b383493..107379a2 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -96,7 +96,7 @@ class BlockLoader(object):
else:
return self.load_file_or_resource(url, offset, length)
- def load_file_or_resource(self, url, offset, length):
+ def load_file_or_resource(self, url, offset=0, length=-1):
"""
Load a file-like reader from the local file system
"""
diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html
new file mode 100644
index 00000000..ed4bc4f3
--- /dev/null
+++ b/sample_archive/text_content/sample_no_head.html
@@ -0,0 +1,8 @@
+
+Test Content
+Some Link