From 71e8ada57d3e1ef884424ae49067bf2b68094f10 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 4 Aug 2014 20:45:02 -0700 Subject: [PATCH] rewrite: add test for banner-only mode, rewriting w/o a head using local 'sample_no_head' file. query.html: use client side rewriting for calendar dates rewrite: remove unused decode stuff --- pywb/rewrite/rewrite_content.py | 63 ++----------------- pywb/rewrite/rewrite_live.py | 9 ++- pywb/rewrite/test/test_html_rewriter.py | 1 + pywb/rewrite/test/test_rewrite_live.py | 53 ++++++++++++++++ pywb/rewrite/test/test_url_rewriter.py | 3 + pywb/rewrite/url_rewriter.py | 3 +- pywb/ui/query.html | 30 ++++++++- pywb/utils/loaders.py | 2 +- .../text_content/sample_no_head.html | 8 +++ 9 files changed, 108 insertions(+), 64 deletions(-) create mode 100644 sample_archive/text_content/sample_no_head.html diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 3a635d4e..3cbcd362 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -105,16 +105,6 @@ class RewriteContent: else: stream = DecompressingBufferedReader(stream) - #if self.decode_stream: - # if rewritten_headers.charset: - # encoding = rewritten_headers.charset - # else: - # (encoding, first_buff) = self._detect_charset(stream) - - # if encoding not set or chardet thinks its ascii, use utf-8 - # if not encoding or encoding == 'ascii': - # encoding = 'utf-8' - rule = self.ruleset.get_first_match(urlkey) rewriter_class = rule.rewriters[text_type] @@ -145,8 +135,11 @@ class RewriteContent: rewriter = rewriter_class(urlrewriter) # Create rewriting generator - gen = self._rewriting_stream_gen(rewriter, encoding, - stream, first_buff) + gen = self.stream_to_gen(stream, + rewrite_func=rewriter.rewrite, + final_read_func=rewriter.close, + first_buff=first_buff) + return (status_headers, gen, True) @@ -175,32 +168,6 @@ class RewriteContent: for buff in self.stream_to_gen(stream): yield buff - - # Create rewrite stream, may even be chunked by front-end - def _rewriting_stream_gen(self, rewriter, encoding, - stream, first_buff=None): - - def do_rewrite(buff): - if encoding: - buff = self._decode_buff(buff, stream, encoding) - buff = rewriter.rewrite(buff) - if encoding: - buff = buff.encode(encoding) - - return buff - - def do_finish(): - result = rewriter.close() - if encoding: - result = result.encode(encoding) - - return result - - return self.stream_to_gen(stream, - rewrite_func=do_rewrite, - final_read_func=do_finish, - first_buff=first_buff) - @staticmethod def _decode_buff(buff, stream, encoding): # pragma: no coverage try: @@ -219,26 +186,6 @@ class RewriteContent: return buff - def _detect_charset(self, stream): # pragma: no coverage - full_buff = stream.read(8192) - io_buff = BytesIO(full_buff) - - detector = UniversalDetector() - - try: - buff = io_buff.read(256) - while buff: - detector.feed(buff) - if detector.done: - break - - buff = io_buff.read(256) - finally: - detector.close() - - print "chardet result: ", str(detector.result) - return (detector.result['encoding'], full_buff) - # Create a generator reading from a stream, # with optional rewriting and final read call @staticmethod diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index cbd3f106..5d77ff52 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -9,7 +9,7 @@ import logging from urlparse import urlsplit -from pywb.utils.loaders import is_http, LimitReader +from pywb.utils.loaders import is_http, LimitReader, BlockLoader from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.canonicalize import canonicalize @@ -30,7 +30,8 @@ class LiveRewriter(object): logging.debug('Live Rewrite Direct (no proxy)') def fetch_local_file(self, uri): - fh = open(uri) + #fh = open(uri) + fh = BlockLoader().load_file_or_resource(uri) content_type, _ = mimetypes.guess_type(uri) @@ -135,12 +136,14 @@ class LiveRewriter(object): ts_err = url.split('///') - if len(ts_err) > 1: + if len(ts_err) > 1 and ts_err[0] != 'file:': url = 'http://' + ts_err[1] if url.startswith('//'): url = 'http:' + url + print 'URL ', url + if is_http(url): (status_headers, stream) = self.fetch_http(url, env, req_headers, follow_redirects, diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index ae9b24e2..9ea8edc0 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -99,6 +99,7 @@ ur""" >>> parse('
SomeTest
', head_insert = '')
SomeTest
+# doctype >>> parse('') diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 24f76da1..af25762b 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,5 +1,6 @@ from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.rewrite.wburl import WbUrl from pywb import get_test_dir @@ -33,6 +34,58 @@ def test_local_1(): assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff +def test_local_no_head(): + wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html') + status_headers, buff = get_rewritten(wb_url, + urlrewriter, + head_insert_func, + 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location rewritten + assert 'window.WB_wombat_location = "/other.html"' in buff + + # link rewritten + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + +def test_local_no_head_banner_only(): + wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample_no_head.html') + wb_url.mod = 'bn_' + + status_headers, buff = get_rewritten(wb_url, + urlrewriter, + head_insert_func, + 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location NOT rewritten + assert 'window.location = "/other.html"' in buff + + # link NOT rewritten + assert '"another.html"' in buff + +def test_local_banner_only(): + wb_url = WbUrl('file://' + get_test_dir() + 'text_content/sample.html') + wb_url.mod = 'bn_' + + status_headers, buff = get_rewritten(wb_url, + urlrewriter, + head_insert_func, + 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location NOT rewritten + assert 'window.location = "/other.html"' in buff + + # link NOT rewritten + assert '"another.html"' in buff + def test_local_2_no_js_location_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index a4173d3a..345c4faf 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -65,6 +65,9 @@ >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'mailto:example@example.com' +>>> do_rewrite('file:///some/path/', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') +'file:///some/path/' + >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url() '/abc/19960708im_/' diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index d5593a22..d9b42c1b 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -13,7 +13,8 @@ class UrlRewriter(object): instance and an optional full path prefix """ - NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:'] + NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', + 'mailto:', 'about:', 'file:'] PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] diff --git a/pywb/ui/query.html b/pywb/ui/query.html index c78e1b49..2d1f5c86 100644 --- a/pywb/ui/query.html +++ b/pywb/ui/query.html @@ -1,3 +1,28 @@ + + + +

pywb Sample Calendar Results

{{ cdx_lines | length }} captures of {{ url }} @@ -10,7 +35,9 @@ {% for cdx in cdx_lines %} - {{ cdx['timestamp'] | format_ts}} + + + {{ cdx['statuscode'] }} {{ cdx['original'] }} {{ cdx['filename'] }} @@ -21,3 +48,4 @@ * Unique captures are bold. Other captures are duplicates of a previous capture.

+ diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 6b383493..107379a2 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -96,7 +96,7 @@ class BlockLoader(object): else: return self.load_file_or_resource(url, offset, length) - def load_file_or_resource(self, url, offset, length): + def load_file_or_resource(self, url, offset=0, length=-1): """ Load a file-like reader from the local file system """ diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html new file mode 100644 index 00000000..ed4bc4f3 --- /dev/null +++ b/sample_archive/text_content/sample_no_head.html @@ -0,0 +1,8 @@ + +Test Content +Some Link