diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index da456474..a6f1908b 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -179,13 +179,6 @@ class WbResponse(object): return WbResponse(StatusAndHeaders(status, redir_headers)) def __call__(self, env, start_response): - - # PERF - perfstats = env.get('X_PERF') - if perfstats: - self.status_headers.headers.append(('X-Archive-Perf-Stats', - str(perfstats))) - start_response(self.status_headers.statusline, self.status_headers.headers) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 08b1e997..5a316016 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -101,12 +101,9 @@ class HTMLRewriterMixin(object): if not m: return meta_refresh - try: - meta_refresh = (meta_refresh[:m.start(1)] + - self._rewrite_url(m.group(1)) + - meta_refresh[m.end(1):]) - except Exception: - pass + meta_refresh = (meta_refresh[:m.start(1)] + + self._rewrite_url(m.group(1)) + + meta_refresh[m.end(1):]) return meta_refresh # =========================== @@ -136,7 +133,7 @@ class HTMLRewriterMixin(object): return value.lower() == attr_value.lower() return False - def _rewrite_tag_attrs(self, tag, tag_attrs, escape=False): + def _rewrite_tag_attrs(self, tag, tag_attrs): # special case: script or style parse context if ((tag in self.STATE_TAGS) and not self._wb_parse_context): self._wb_parse_context = tag @@ -197,7 +194,7 @@ class HTMLRewriterMixin(object): rebase_rewriter(attr_value)) # write the attr! 
- self._write_attr(attr_name, attr_value, escape=escape) + self._write_attr(attr_name, attr_value) return True @@ -217,12 +214,10 @@ class HTMLRewriterMixin(object): return True - def _write_attr(self, name, value, escape=False): + def _write_attr(self, name, value): # parser doesn't differentiate between 'attr=""' and just 'attr' # 'attr=""' is more common, so use that form if value: - if escape: - value = cgi.escape(value, quote=True) self.out.write(' ' + name + '="' + value + '"') else: self.out.write(' ' + name + '=""') @@ -259,8 +254,8 @@ class HTMLRewriterMixin(object): return result - def _internal_close(self): - pass + def _internal_close(self): # pragma: no cover + raise NotImplementedError('Base method') #================================================================= @@ -272,7 +267,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def feed(self, string): try: HTMLParser.feed(self, string) - except HTMLParseError: + except HTMLParseError: # pragma: no cover + # only raised in 2.6 self.out.write(string) def _internal_close(self): @@ -283,7 +279,8 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): try: HTMLParser.close(self) - except HTMLParseError: + except HTMLParseError: # pragma: no cover + # only raised in 2.6 pass # called to unescape attrs -- do not unescape! 
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 021f76b6..2225bbaf 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -58,10 +58,12 @@ class RewriteContent: return (rewritten_headers, stream) - def rewrite_content(self, wb_url, urlrewriter, headers, stream, + def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey='', cdx=None): + wb_url = urlrewriter.wburl + if (wb_url.is_identity or (not head_insert_func and wb_url.is_banner_only)): status_headers, stream = self.sanitize_content(headers, stream) @@ -109,16 +111,6 @@ class RewriteContent: else: stream = DecompressingBufferedReader(stream) - #if self.decode_stream: - # if rewritten_headers.charset: - # encoding = rewritten_headers.charset - # else: - # (encoding, first_buff) = self._detect_charset(stream) - - # if encoding not set or chardet thinks its ascii, use utf-8 - # if not encoding or encoding == 'ascii': - # encoding = 'utf-8' - rule = self.ruleset.get_first_match(urlkey) rewriter_class = rule.rewriters[text_type] @@ -149,8 +141,11 @@ class RewriteContent: rewriter = rewriter_class(urlrewriter) # Create rewriting generator - gen = self._rewriting_stream_gen(rewriter, encoding, - stream, first_buff) + gen = self.stream_to_gen(stream, + rewrite_func=rewriter.rewrite, + final_read_func=rewriter.close, + first_buff=first_buff) + return (status_headers, gen, True) @@ -179,32 +174,6 @@ class RewriteContent: for buff in self.stream_to_gen(stream): yield buff - - # Create rewrite stream, may even be chunked by front-end - def _rewriting_stream_gen(self, rewriter, encoding, - stream, first_buff=None): - - def do_rewrite(buff): - if encoding: - buff = self._decode_buff(buff, stream, encoding) - buff = rewriter.rewrite(buff) - if encoding: - buff = buff.encode(encoding) - - return buff - - def do_finish(): - result = rewriter.close() - if encoding: - result = result.encode(encoding) - - return result - - return 
self.stream_to_gen(stream, - rewrite_func=do_rewrite, - final_read_func=do_finish, - first_buff=first_buff) - @staticmethod def _decode_buff(buff, stream, encoding): # pragma: no coverage try: @@ -223,26 +192,6 @@ class RewriteContent: return buff - def _detect_charset(self, stream): # pragma: no coverage - full_buff = stream.read(8192) - io_buff = BytesIO(full_buff) - - detector = UniversalDetector() - - try: - buff = io_buff.read(256) - while buff: - detector.feed(buff) - if detector.done: - break - - buff = io_buff.read(256) - finally: - detector.close() - - print "chardet result: ", str(detector.result) - return (detector.result['encoding'], full_buff) - # Create a generator reading from a stream, # with optional rewriting and final read call @staticmethod diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index cbd3f106..be891498 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -9,7 +9,7 @@ import logging from urlparse import urlsplit -from pywb.utils.loaders import is_http, LimitReader +from pywb.utils.loaders import is_http, LimitReader, BlockLoader from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.canonicalize import canonicalize @@ -30,7 +30,8 @@ class LiveRewriter(object): logging.debug('Live Rewrite Direct (no proxy)') def fetch_local_file(self, uri): - fh = open(uri) + #fh = open(uri) + fh = BlockLoader().load_file_or_resource(uri) content_type, _ = mimetypes.guess_type(uri) @@ -118,7 +119,7 @@ class LiveRewriter(object): return (status_headers, stream) - def fetch_request(self, wb_url, urlrewriter, + def fetch_request(self, url, urlrewriter, head_insert_func=None, urlkey=None, env=None, @@ -127,15 +128,11 @@ class LiveRewriter(object): follow_redirects=False, proxies=None): - if isinstance(wb_url, str): - url = wb_url - wb_url = WbUrl(url) - else: - url = wb_url.url - ts_err = url.split('///') - if len(ts_err) > 1: + # 
fixup for accidental erroneous rewrite which has /// + # (unless file:///) + if len(ts_err) > 1 and ts_err[0] != 'file:': url = 'http://' + ts_err[1] if url.startswith('//'): @@ -164,8 +161,7 @@ class LiveRewriter(object): } result = (self.rewriter. - rewrite_content(wb_url, - urlrewriter, + rewrite_content(urlrewriter, status_headers, stream, head_insert_func=head_insert_func, diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index ae9b24e2..9ea8edc0 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -99,6 +99,7 @@ ur""" >>> parse('
SomeTest
', head_insert = '')
SomeTest
+# doctype >>> parse('') diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 24f76da1..fcb51ea3 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -1,5 +1,6 @@ from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.rewrite.wburl import WbUrl from pywb import get_test_dir @@ -9,6 +10,7 @@ from io import BytesIO # As such, the content may change and the test may break urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/') def head_insert_func(rule, cdx): if rule.js_rewrite_location == True: @@ -33,6 +35,51 @@ def test_local_1(): assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff +def test_local_no_head(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html', + urlrewriter, + head_insert_func, + 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location rewritten + assert 'window.WB_wombat_location = "/other.html"' in buff + + # link rewritten + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + +def test_local_no_head_banner_only(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html', + bn_urlrewriter, + head_insert_func, + 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location NOT rewritten + assert 'window.location = "/other.html"' in buff + + # link NOT rewritten + assert '"another.html"' in buff + +def test_local_banner_only(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + bn_urlrewriter, + head_insert_func, + 'com,example,test)/') + + # wombat insert added + assert '' in buff + + # location NOT rewritten + assert 'window.location = "/other.html"' 
in buff + + # link NOT rewritten + assert '"another.html"' in buff + def test_local_2_no_js_location_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, @@ -76,8 +123,7 @@ def test_example_4_rewrite_err(): assert status_headers.get_statuscode() == '200' def test_example_domain_specific_3(): - urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') - status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2, follow_redirects=True) + status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True) # comment out bootloader assert '/* Bootloader.configurePage' in buff diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index a4173d3a..73340c95 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -65,6 +65,9 @@ >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'mailto:example@example.com' +>>> do_rewrite('file:///some/path/', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') +'file:///some/path/' + >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url() '/abc/19960708im_/' @@ -73,10 +76,10 @@ # HttpsUrlRewriter tests ->>> HttpsUrlRewriter(None, None).rewrite('https://example.com/abc') +>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc') 'http://example.com/abc' ->>> HttpsUrlRewriter(None, None).rewrite('http://example.com/abc') +>>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc') 'http://example.com/abc' """ diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 2679b4dc..c89e9a21 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -13,7 +13,8 @@ class 
UrlRewriter(object): instance and an optional full path prefix """ - NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:'] + NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', + 'mailto:', 'about:', 'file:'] PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] @@ -125,7 +126,7 @@ class UrlRewriter(object): #================================================================= -class HttpsUrlRewriter(object): +class HttpsUrlRewriter(UrlRewriter): """ A url rewriter which urls that start with https:// to http:// Other urls/input is unchanged. @@ -134,9 +135,6 @@ class HttpsUrlRewriter(object): HTTP = 'http://' HTTPS = 'https://' - def __init__(self, wburl, prefix, full_prefix=None): - pass - def rewrite(self, url, mod=None): if url.startswith(self.HTTPS): result = self.HTTP + url[len(self.HTTPS):] diff --git a/pywb/ui/query.html b/pywb/ui/query.html index c78e1b49..2d1f5c86 100644 --- a/pywb/ui/query.html +++ b/pywb/ui/query.html @@ -1,3 +1,28 @@ + + + +

pywb Sample Calendar Results

{{ cdx_lines | length }} captures of {{ url }} @@ -10,7 +35,9 @@ {% for cdx in cdx_lines %} - {{ cdx['timestamp'] | format_ts}} + + + {{ cdx['statuscode'] }} {{ cdx['original'] }} {{ cdx['filename'] }} @@ -21,3 +48,4 @@ * Unique captures are bold. Other captures are duplicates of a previous capture.

+
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 6b383493..107379a2 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -96,7 +96,7 @@ class BlockLoader(object):
         else:
             return self.load_file_or_resource(url, offset, length)
 
-    def load_file_or_resource(self, url, offset, length):
+    def load_file_or_resource(self, url, offset=0, length=-1):
         """
         Load a file-like reader from the local file system
         """
diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py
index b64f2419..322b9169 100644
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@@ -1,5 +1,5 @@
 #=================================================================
-"""
+r"""
 # LimitReader Tests
 >>> LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
 'abcdefghji'
@@ -32,10 +32,14 @@ True
 >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
 'Example Domain'
 
-# fixed cookie
+# fixed cookie, range request
 >>> BlockLoader('some=value').load('http://example.com', 41, 14).read()
 'Example Domain'
 
+# range request
+>>> BlockLoader().load('http://example.com', 1262).read()
+'\n'
+
 # test with extra id, ensure 4 parts of the A-B=C-D form are present
 >>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra')))
 4
diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py
index a1b602d4..cb279beb 100644
--- a/pywb/webapp/live_rewrite_handler.py
+++ b/pywb/webapp/live_rewrite_handler.py
@@ -38,6 +38,11 @@ class RewriteHandler(SearchPageWbUrlHandler):
             return self.render_content(wbrequest)
 
         except Exception as exc:
+            import traceback
+            # format_exc() reads the active exception; its only arg is a limit
+            err_details = traceback.format_exc()
+            print err_details
+
             url = wbrequest.wb_url.url
             msg = 'Could not load the url from the live web: ' + url
             raise LiveResourceException(msg=msg, url=url)
@@ -53,8 +58,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
         if ref_wburl_str:
             wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
 
-
wb_url = wbrequest.wb_url - result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter, + result = self.rewriter.fetch_request(wbrequest.wb_url.url, + wbrequest.urlrewriter, head_insert_func=head_insert_func, req_headers=req_headers, env=wbrequest.env) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 5002a18d..9f32ad5d 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -130,8 +130,7 @@ class ReplayView(object): create_insert_func(wbrequest)) result = (self.content_rewriter. - rewrite_content(wbrequest.wb_url, - urlrewriter, + rewrite_content(urlrewriter, headers=status_headers, stream=stream, head_insert_func=head_insert_func, diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html new file mode 100644 index 00000000..ed4bc4f3 --- /dev/null +++ b/sample_archive/text_content/sample_no_head.html @@ -0,0 +1,8 @@ + +Test Content +Some Link diff --git a/tests/test_integration.py b/tests/test_integration.py index 5723425e..67bf698b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -301,6 +301,11 @@ class TestWb: assert resp.status_int == 200 assert '"data": "^"' in resp.body + def test_post_invalid(self): + # not json + resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404) + assert resp.status_int == 404 + def test_post_redirect(self): # post handled without redirect (since 307 not allowed) resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014mp_/http://httpbin.org/post')]) @@ -308,7 +313,6 @@ class TestWb: assert '"foo": "bar"' in resp.body assert '"test": "abc"' in resp.body - def test_excluded_content(self): resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_img/bookmark_icon.ico', status = 403) assert resp.status_int == 403 diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py 
index 5ce19414..331eaa69 100644
--- a/tests/test_live_rewriter.py
+++ b/tests/test_live_rewriter.py
@@ -17,6 +17,13 @@ class TestLiveRewriter:
         resp = self.testapp.get('/rewrite/mp_/http://facebook.com/')
         assert resp.status_int == 301
 
+    def test_live_rewrite_post(self):
+        # POST form data via the live rewriter; body must contain the fields
+        resp = self.testapp.post('/rewrite/mp_/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
+        assert resp.status_int == 200
+        assert '"foo": "bar"' in resp.body
+        assert '"test": "abc"' in resp.body
+
     def test_live_rewrite_frame(self):
         resp = self.testapp.get('/rewrite/http://example.com/')
         assert resp.status_int == 200