mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: add tests for banner-only mode and for rewriting without a <head>, using the local 'sample_no_head' file.
query.html: use client-side rewriting for calendar dates.
rewrite: remove unused decode stuff.
This commit is contained in:
parent
72fe274e06
commit
71e8ada57d
@ -105,16 +105,6 @@ class RewriteContent:
|
||||
else:
|
||||
stream = DecompressingBufferedReader(stream)
|
||||
|
||||
#if self.decode_stream:
|
||||
# if rewritten_headers.charset:
|
||||
# encoding = rewritten_headers.charset
|
||||
# else:
|
||||
# (encoding, first_buff) = self._detect_charset(stream)
|
||||
|
||||
# if encoding not set or chardet thinks its ascii, use utf-8
|
||||
# if not encoding or encoding == 'ascii':
|
||||
# encoding = 'utf-8'
|
||||
|
||||
rule = self.ruleset.get_first_match(urlkey)
|
||||
|
||||
rewriter_class = rule.rewriters[text_type]
|
||||
@ -145,8 +135,11 @@ class RewriteContent:
|
||||
rewriter = rewriter_class(urlrewriter)
|
||||
|
||||
# Create rewriting generator
|
||||
gen = self._rewriting_stream_gen(rewriter, encoding,
|
||||
stream, first_buff)
|
||||
gen = self.stream_to_gen(stream,
|
||||
rewrite_func=rewriter.rewrite,
|
||||
final_read_func=rewriter.close,
|
||||
first_buff=first_buff)
|
||||
|
||||
|
||||
return (status_headers, gen, True)
|
||||
|
||||
@ -175,32 +168,6 @@ class RewriteContent:
|
||||
for buff in self.stream_to_gen(stream):
|
||||
yield buff
|
||||
|
||||
|
||||
# Create rewrite stream, may even be chunked by front-end
|
||||
def _rewriting_stream_gen(self, rewriter, encoding,
                          stream, first_buff=None):
    """
    Build the rewriting generator for ``stream``.

    Each buffer is passed to ``rewriter.rewrite`` and the final flush
    comes from ``rewriter.close``.  When ``encoding`` is truthy, every
    buffer is first run through ``self._decode_buff`` and the rewritten
    text (including the final flush) is encoded back with the same
    encoding; otherwise buffers are handed to the rewriter untouched.

    Returns the result of ``self.stream_to_gen``.
    """

    def rewrite_chunk(chunk):
        # No encoding configured: rewrite the raw buffer as-is
        if not encoding:
            return rewriter.rewrite(chunk)

        decoded = self._decode_buff(chunk, stream, encoding)
        rewritten = rewriter.rewrite(decoded)
        return rewritten.encode(encoding)

    def flush_rewriter():
        remainder = rewriter.close()
        if not encoding:
            return remainder

        return remainder.encode(encoding)

    return self.stream_to_gen(stream,
                              rewrite_func=rewrite_chunk,
                              final_read_func=flush_rewriter,
                              first_buff=first_buff)
|
||||
|
||||
@staticmethod
|
||||
def _decode_buff(buff, stream, encoding): # pragma: no coverage
|
||||
try:
|
||||
@ -219,26 +186,6 @@ class RewriteContent:
|
||||
|
||||
return buff
|
||||
|
||||
def _detect_charset(self, stream):  # pragma: no coverage
    """
    Guess the character encoding of the start of ``stream``.

    Reads up to 8192 bytes from ``stream`` and feeds them to a
    ``UniversalDetector`` in 256-byte slices, stopping early once the
    detector reports it is done.

    Returns a ``(encoding, full_buff)`` tuple: the detector's
    ``result['encoding']`` value and the bytes that were read from
    ``stream`` for sniffing.
    """
    # Local import so this block does not rely on a module-level import
    import logging

    full_buff = stream.read(8192)
    io_buff = BytesIO(full_buff)

    detector = UniversalDetector()

    try:
        buff = io_buff.read(256)
        while buff:
            detector.feed(buff)
            if detector.done:
                break

            buff = io_buff.read(256)
    finally:
        # Always finalize the detector so ``result`` is populated
        detector.close()

    # Report through logging instead of printing to stdout
    logging.debug('chardet result: %s', detector.result)
    return (detector.result['encoding'], full_buff)
|
||||
|
||||
# Create a generator reading from a stream,
|
||||
# with optional rewriting and final read call
|
||||
@staticmethod
|
||||
|
@ -9,7 +9,7 @@ import logging
|
||||
|
||||
from urlparse import urlsplit
|
||||
|
||||
from pywb.utils.loaders import is_http, LimitReader
|
||||
from pywb.utils.loaders import is_http, LimitReader, BlockLoader
|
||||
from pywb.utils.timeutils import datetime_to_timestamp
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
@ -30,7 +30,8 @@ class LiveRewriter(object):
|
||||
logging.debug('Live Rewrite Direct (no proxy)')
|
||||
|
||||
def fetch_local_file(self, uri):
|
||||
fh = open(uri)
|
||||
#fh = open(uri)
|
||||
fh = BlockLoader().load_file_or_resource(uri)
|
||||
|
||||
content_type, _ = mimetypes.guess_type(uri)
|
||||
|
||||
@ -135,12 +136,14 @@ class LiveRewriter(object):
|
||||
|
||||
ts_err = url.split('///')
|
||||
|
||||
if len(ts_err) > 1:
|
||||
if len(ts_err) > 1 and ts_err[0] != 'file:':
|
||||
url = 'http://' + ts_err[1]
|
||||
|
||||
if url.startswith('//'):
|
||||
url = 'http:' + url
|
||||
|
||||
print 'URL ', url
|
||||
|
||||
if is_http(url):
|
||||
(status_headers, stream) = self.fetch_http(url, env, req_headers,
|
||||
follow_redirects,
|
||||
|
@ -99,6 +99,7 @@ ur"""
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html PUBLIC "public">')
|
||||
<!doctype html PUBLIC "public">
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
@ -33,6 +34,58 @@ def test_local_1():
|
||||
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
|
||||
|
||||
|
||||
def test_local_no_head():
    """Default rewrite of the local 'sample_no_head.html' sample page."""
    sample_uri = ('file://' + get_test_dir() +
                  'text_content/sample_no_head.html')

    _, rewritten = get_rewritten(WbUrl(sample_uri),
                                 urlrewriter,
                                 head_insert_func,
                                 'com,example,test)/')

    expected_fragments = (
        # wombat insert added
        '<script src="/static/default/wombat.js"> </script>',
        # location rewritten
        'window.WB_wombat_location = "/other.html"',
        # link rewritten
        '"/pywb/20131226101010/http://example.com/some/path/another.html"',
    )

    for fragment in expected_fragments:
        assert fragment in rewritten
|
||||
|
||||
def test_local_no_head_banner_only():
    """Banner-only ('bn_') rewrite of the local 'sample_no_head.html' page."""
    sample_uri = ('file://' + get_test_dir() +
                  'text_content/sample_no_head.html')

    banner_url = WbUrl(sample_uri)
    banner_url.mod = 'bn_'

    _, rewritten = get_rewritten(banner_url,
                                 urlrewriter,
                                 head_insert_func,
                                 'com,example,test)/')

    expected_fragments = (
        # wombat insert added
        '<script src="/static/default/wombat.js"> </script>',
        # location NOT rewritten
        'window.location = "/other.html"',
        # link NOT rewritten
        '"another.html"',
    )

    for fragment in expected_fragments:
        assert fragment in rewritten
|
||||
|
||||
def test_local_banner_only():
    """Banner-only ('bn_') rewrite of the local 'sample.html' page."""
    sample_uri = 'file://' + get_test_dir() + 'text_content/sample.html'

    banner_url = WbUrl(sample_uri)
    banner_url.mod = 'bn_'

    _, rewritten = get_rewritten(banner_url,
                                 urlrewriter,
                                 head_insert_func,
                                 'com,example,test)/')

    expected_fragments = (
        # wombat insert added
        '<head><script src="/static/default/wombat.js"> </script>',
        # location NOT rewritten
        'window.location = "/other.html"',
        # link NOT rewritten
        '"another.html"',
    )

    for fragment in expected_fragments:
        assert fragment in rewritten
|
||||
|
||||
def test_local_2_no_js_location_rewrite():
|
||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||
urlrewriter,
|
||||
|
@ -65,6 +65,9 @@
|
||||
>>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'mailto:example@example.com'
|
||||
|
||||
>>> do_rewrite('file:///some/path/', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
|
||||
'file:///some/path/'
|
||||
|
||||
>>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
|
||||
'/abc/19960708im_/'
|
||||
|
||||
|
@ -13,7 +13,8 @@ class UrlRewriter(object):
|
||||
instance and an optional full path prefix
|
||||
"""
|
||||
|
||||
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
|
||||
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:',
|
||||
'mailto:', 'about:', 'file:']
|
||||
|
||||
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||
|
||||
|
@ -1,3 +1,28 @@
|
||||
<html>
|
||||
<head>
|
||||
<script>
|
||||
// Convert a 14-digit timestamp string (YYYYMMDDhhmmss, read as UTC)
// into a human-readable date string.
//   ts     - timestamp string; anything shorter than 14 chars is returned as-is
//   is_gmt - truthy: format in UTC; falsy: format in the browser's locale/timezone
function ts_to_date(ts, is_gmt)
{
    if (ts.length < 14) {
        return ts;
    }

    // Build an ISO 8601 date-time, using "Z" as the UTC designator
    var datestr = (ts.substring(0, 4) + "-" +
                   ts.substring(4, 6) + "-" +
                   ts.substring(6, 8) + "T" +
                   ts.substring(8, 10) + ":" +
                   ts.substring(10, 12) + ":" +
                   ts.substring(12, 14) + "Z");

    var date = new Date(datestr);

    // Unparseable digits: show the raw timestamp rather than "Invalid Date"
    if (isNaN(date.getTime())) {
        return ts;
    }

    if (is_gmt) {
        // toUTCString() is the standard replacement for the deprecated toGMTString()
        return date.toUTCString();
    } else {
        return date.toLocaleString();
    }
}
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<h2>pywb Sample Calendar Results</h2>
|
||||
<b>{{ cdx_lines | length }}</b> captures of <b>{{ url }}</b>
|
||||
@ -10,7 +35,9 @@
|
||||
</tr>
|
||||
{% for cdx in cdx_lines %}
|
||||
<tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}">
|
||||
<td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">{{ cdx['timestamp'] | format_ts}}</a></td>
|
||||
<td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">
|
||||
<script>document.write(ts_to_date("{{ cdx['timestamp']}}", true))</script>
|
||||
</a></td>
|
||||
<td>{{ cdx['statuscode'] }}</td>
|
||||
<td>{{ cdx['original'] }}</td>
|
||||
<td>{{ cdx['filename'] }}</td>
|
||||
@ -21,3 +48,4 @@
|
||||
<i><b>* Unique captures are bold.</b> Other captures are duplicates of a previous capture.</i>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -96,7 +96,7 @@ class BlockLoader(object):
|
||||
else:
|
||||
return self.load_file_or_resource(url, offset, length)
|
||||
|
||||
def load_file_or_resource(self, url, offset, length):
|
||||
def load_file_or_resource(self, url, offset=0, length=-1):
|
||||
"""
|
||||
Load a file-like reader from the local file system
|
||||
"""
|
||||
|
8
sample_archive/text_content/sample_no_head.html
Normal file
8
sample_archive/text_content/sample_no_head.html
Normal file
@ -0,0 +1,8 @@
|
||||
<script>
|
||||
var some_val = false;
|
||||
if (some_val) {
|
||||
window.location = "/other.html";
|
||||
}
|
||||
</script>
|
||||
Test Content
|
||||
<a href="another.html">Some Link</a>
|
Loading…
x
Reference in New Issue
Block a user