1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge branch 'develop' into https-proxy for 'bn_' modifier support

This commit is contained in:
Ilya Kreymer 2014-07-29 12:26:50 -07:00
commit fc6ffc6c11
8 changed files with 98 additions and 63 deletions

View File

@ -1,6 +1,8 @@
#import chardet #import chardet
import pkgutil import pkgutil
import yaml import yaml
import re
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
from io import BytesIO from io import BytesIO
@ -52,11 +54,12 @@ class RewriteContent:
return (rewritten_headers, stream) return (rewritten_headers, stream)
def rewrite_content(self, urlrewriter, headers, stream, def rewrite_content(self, wb_url, urlrewriter, headers, stream,
head_insert_func=None, urlkey='', head_insert_func=None, urlkey='',
sanitize_only=False, cdx=None, mod=None): cdx=None):
if sanitize_only: if (wb_url.is_identity or
(not head_insert_func and wb_url.is_banner_only)):
status_headers, stream = self.sanitize_content(headers, stream) status_headers, stream = self.sanitize_content(headers, stream)
return (status_headers, self.stream_to_gen(stream), False) return (status_headers, self.stream_to_gen(stream), False)
@ -78,6 +81,8 @@ class RewriteContent:
# see known js/css modifier specified, the context should run # see known js/css modifier specified, the context should run
# default text_type # default text_type
mod = wb_url.mod
if mod == 'js_': if mod == 'js_':
text_type = 'js' text_type = 'js'
elif mod == 'cs_': elif mod == 'cs_':
@ -118,6 +123,10 @@ class RewriteContent:
if head_insert_func: if head_insert_func:
head_insert_str = head_insert_func(rule, cdx) head_insert_str = head_insert_func(rule, cdx)
if wb_url.is_banner_only:
gen = self._head_insert_only_gen(head_insert_str, stream)
return (status_headers, gen, False)
rewriter = rewriter_class(urlrewriter, rewriter = rewriter_class(urlrewriter,
js_rewriter_class=rule.rewriters['js'], js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'], css_rewriter_class=rule.rewriters['css'],
@ -125,7 +134,10 @@ class RewriteContent:
defmod=self.defmod) defmod=self.defmod)
else: else:
# apply one of (js, css, xml) rewriters if wb_url.is_banner_only:
return (status_headers, self.stream_to_gen(stream), False)
# apply one of (js, css, xml) rewriters
rewriter = rewriter_class(urlrewriter) rewriter = rewriter_class(urlrewriter)
# Create rewriting generator # Create rewriting generator
@ -134,6 +146,32 @@ class RewriteContent:
return (status_headers, gen, True) return (status_headers, gen, True)
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
def _head_insert_only_gen(self, insert_str, stream):
max_len = 1024
buff = ''
while max_len > 0:
curr = stream.read(max_len)
if not curr:
break
max_len -= len(buff)
buff += curr
matcher = self.HEAD_REGEX.search(buff)
if matcher:
yield buff[:matcher.end()] + insert_str
yield buff[matcher.end():]
else:
yield insert_str
yield buff
for buff in self.stream_to_gen(stream):
yield buff
# Create rewrite stream, may even be chunked by front-end # Create rewrite stream, may even be chunked by front-end
def _rewriting_stream_gen(self, rewriter, encoding, def _rewriting_stream_gen(self, rewriter, encoding,
stream, first_buff=None): stream, first_buff=None):

View File

@ -14,8 +14,9 @@ from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.url_rewriter import UrlRewriter from url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent from wburl import WbUrl
from rewrite_content import RewriteContent
#================================================================= #=================================================================
@ -114,15 +115,20 @@ class LiveRewriter(object):
return (status_headers, stream) return (status_headers, stream)
def fetch_request(self, url, urlrewriter, def fetch_request(self, wb_url, urlrewriter,
head_insert_func=None, head_insert_func=None,
urlkey=None, urlkey=None,
env=None, env=None,
req_headers={}, req_headers={},
timestamp=None, timestamp=None,
follow_redirects=False, follow_redirects=False,
proxies=None, proxies=None):
mod=None):
if isinstance(wb_url, str):
url = wb_url
wb_url = WbUrl(url)
else:
url = wb_url.url
ts_err = url.split('///') ts_err = url.split('///')
@ -155,13 +161,13 @@ class LiveRewriter(object):
} }
result = (self.rewriter. result = (self.rewriter.
rewrite_content(urlrewriter, rewrite_content(wb_url,
urlrewriter,
status_headers, status_headers,
stream, stream,
head_insert_func=head_insert_func, head_insert_func=head_insert_func,
urlkey=urlkey, urlkey=urlkey,
cdx=cdx, cdx=cdx))
mod=mod))
return result return result
@ -174,41 +180,3 @@ class LiveRewriter(object):
buff = ''.join(gen) buff = ''.join(gen)
return (status_headers, buff) return (status_headers, buff)
#=================================================================
def main():  # pragma: no cover
    """Command-line entry point: fetch a live url and print it rewritten.

    Usage: ``prog url-to-fetch [wb-url-target]``

    If ``wb-url-target`` is given, its first path segment is used as the
    rewrite prefix and the rest as the wb url string. Otherwise a default
    prefix and a wb url built from the current time are used.

    :return: 1 if no url was supplied (usage is printed), else 0
    """
    import sys

    if len(sys.argv) < 2:
        msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'
        # Call form with a single argument prints the same text on
        # Python 2 and Python 3.
        print(msg.format(sys.argv[0]))
        return 1

    url = sys.argv[1]

    if len(sys.argv) >= 3:
        wburl_str = sys.argv[2]
        if wburl_str.startswith('/'):
            wburl_str = wburl_str[1:]

        # '<prefix>/<wb url>' -> '/<prefix>/' and '<wb url>'
        prefix, wburl_str = wburl_str.split('/', 1)
        prefix = '/' + prefix + '/'
    else:
        wburl_str = (datetime_to_timestamp(datetime.datetime.now()) +
                     '/http://example.com/path/sample.html')
        prefix = '/pywb_rewrite/'

    urlrewriter = UrlRewriter(wburl_str, prefix)

    liverewriter = LiveRewriter()

    status_headers, buff = liverewriter.get_rewritten(url, urlrewriter)

    sys.stdout.write(buff)
    return 0


#=================================================================
if __name__ == "__main__":
    # sys.exit is always available; the bare exit() helper is only
    # installed by the site module.
    import sys
    sys.exit(main())

View File

@ -196,8 +196,11 @@ class WbUrl(BaseWbUrl):
@property @property
def is_embed(self): def is_embed(self):
return (self.mod and return (self.mod and
self.mod != 'id_' and self.mod not in ('id_', 'mp_', 'bn_'))
self.mod != 'mp_')
@property
def is_banner_only(self):
    """True when this url's modifier is the banner-only modifier 'bn_'."""
    banner_mod = 'bn_'
    return self.mod == banner_mod
@property @property
def is_identity(self): def is_identity(self):

View File

@ -1,5 +1,5 @@
<!-- WB Insert --> <!-- WB Insert -->
{% if rule.js_rewrite_location %} {% if rule.js_rewrite_location and include_wombat %}
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script> <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
<script> <script>
{% set urlsplit = cdx['original'] | urlsplit %} {% set urlsplit = cdx['original'] | urlsplit %}
@ -16,9 +16,10 @@
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}"; wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}"; wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}}; wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}} wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.canon_url = "{{ canon_url }}"; wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }}; wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
wbinfo.is_proxy_mode = {{ "true" if wbrequest.options.is_proxy else "false" }};
</script> </script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script> <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/> <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>

View File

@ -115,6 +115,14 @@ class StaticHandler(BaseHandler):
try: try:
data = self.block_loader.load(full_path) data = self.block_loader.load(full_path)
try:
data.seek(0, 2)
size = data.tell()
data.seek(0)
headers = [('Content-Length', str(size))]
except IOError:
headers = None
if 'wsgi.file_wrapper' in wbrequest.env: if 'wsgi.file_wrapper' in wbrequest.env:
reader = wbrequest.env['wsgi.file_wrapper'](data) reader = wbrequest.env['wsgi.file_wrapper'](data)
else: else:
@ -122,7 +130,9 @@ class StaticHandler(BaseHandler):
content_type, _ = mimetypes.guess_type(full_path) content_type, _ = mimetypes.guess_type(full_path)
return WbResponse.text_stream(data, content_type=content_type) return WbResponse.text_stream(data,
content_type=content_type,
headers=headers)
except IOError: except IOError:
raise NotFoundException('Static File Not Found: ' + raise NotFoundException('Static File Not Found: ' +

View File

@ -99,8 +99,8 @@ class RewriteLiveView(BaseContentView):
if ref_wburl_str: if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
url = wbrequest.wb_url.url wb_url = wbrequest.wb_url
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter, result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter,
head_insert_func=head_insert_func, head_insert_func=head_insert_func,
env=wbrequest.env) env=wbrequest.env)
@ -211,14 +211,13 @@ class ReplayView(BaseContentView):
create_insert_func(wbrequest)) create_insert_func(wbrequest))
result = (self.content_rewriter. result = (self.content_rewriter.
rewrite_content(urlrewriter, rewrite_content(wbrequest.wb_url,
urlrewriter,
headers=status_headers, headers=status_headers,
stream=stream, stream=stream,
head_insert_func=head_insert_func, head_insert_func=head_insert_func,
urlkey=cdx['urlkey'], urlkey=cdx['urlkey'],
sanitize_only=wbrequest.wb_url.is_identity, cdx=cdx))
cdx=cdx,
mod=wbrequest.wb_url.mod))
(status_headers, response_iter, is_rewritten) = result (status_headers, response_iter, is_rewritten) = result

View File

@ -121,16 +121,18 @@ def add_env_globals(glb):
#================================================================= #=================================================================
class HeadInsertView(J2TemplateView): class HeadInsertView(J2TemplateView):
def create_insert_func(self, wbrequest, include_ts=True): def create_insert_func(self, wbrequest,
include_ts=True):
canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='') canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='')
include_ts = include_ts include_wombat = not wbrequest.wb_url.is_banner_only
def make_head_insert(rule, cdx): def make_head_insert(rule, cdx):
return (self.render_to_string(wbrequest=wbrequest, return (self.render_to_string(wbrequest=wbrequest,
cdx=cdx, cdx=cdx,
canon_url=canon_url, canon_url=canon_url,
include_ts=include_ts, include_ts=include_ts,
include_wombat=include_wombat,
rule=rule)) rule=rule))
return make_head_insert return make_head_insert

View File

@ -98,6 +98,7 @@ class TestWb:
assert 'Mon, Jan 27 2014 17:12:38' in resp.body assert 'Mon, Jan 27 2014 17:12:38' in resp.body
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
assert 'WB_wombat_init' in resp.body
assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body
def test_replay_non_frame_content(self): def test_replay_non_frame_content(self):
@ -141,6 +142,19 @@ class TestWb:
assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239') assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')
def test_replay_banner_only(self):
    """Replay with the 'bn_' modifier: banner script present, no rewriting."""
    body = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved').body

    # the wb.js header insert is present
    assert 'wb.js' in body

    # wombat is not included
    assert 'WB_wombat_init' not in body

    # urls in the page are left as-is
    #assert '"http://www.iana.org/domains/example"' in resp.body
    assert '"/_css/2013.1/screen.css"' in body
def test_replay_identity_1(self): def test_replay_identity_1(self):
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')