mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'develop' into https-proxy for 'bn_' modifier support
This commit is contained in:
commit
fc6ffc6c11
@ -1,6 +1,8 @@
|
|||||||
#import chardet
|
#import chardet
|
||||||
import pkgutil
|
import pkgutil
|
||||||
import yaml
|
import yaml
|
||||||
|
import re
|
||||||
|
|
||||||
from chardet.universaldetector import UniversalDetector
|
from chardet.universaldetector import UniversalDetector
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
@ -52,11 +54,12 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (rewritten_headers, stream)
|
return (rewritten_headers, stream)
|
||||||
|
|
||||||
def rewrite_content(self, urlrewriter, headers, stream,
|
def rewrite_content(self, wb_url, urlrewriter, headers, stream,
|
||||||
head_insert_func=None, urlkey='',
|
head_insert_func=None, urlkey='',
|
||||||
sanitize_only=False, cdx=None, mod=None):
|
cdx=None):
|
||||||
|
|
||||||
if sanitize_only:
|
if (wb_url.is_identity or
|
||||||
|
(not head_insert_func and wb_url.is_banner_only)):
|
||||||
status_headers, stream = self.sanitize_content(headers, stream)
|
status_headers, stream = self.sanitize_content(headers, stream)
|
||||||
return (status_headers, self.stream_to_gen(stream), False)
|
return (status_headers, self.stream_to_gen(stream), False)
|
||||||
|
|
||||||
@ -78,6 +81,8 @@ class RewriteContent:
|
|||||||
|
|
||||||
# see known js/css modifier specified, the context should run
|
# see known js/css modifier specified, the context should run
|
||||||
# default text_type
|
# default text_type
|
||||||
|
mod = wb_url.mod
|
||||||
|
|
||||||
if mod == 'js_':
|
if mod == 'js_':
|
||||||
text_type = 'js'
|
text_type = 'js'
|
||||||
elif mod == 'cs_':
|
elif mod == 'cs_':
|
||||||
@ -118,6 +123,10 @@ class RewriteContent:
|
|||||||
if head_insert_func:
|
if head_insert_func:
|
||||||
head_insert_str = head_insert_func(rule, cdx)
|
head_insert_str = head_insert_func(rule, cdx)
|
||||||
|
|
||||||
|
if wb_url.is_banner_only:
|
||||||
|
gen = self._head_insert_only_gen(head_insert_str, stream)
|
||||||
|
return (status_headers, gen, False)
|
||||||
|
|
||||||
rewriter = rewriter_class(urlrewriter,
|
rewriter = rewriter_class(urlrewriter,
|
||||||
js_rewriter_class=rule.rewriters['js'],
|
js_rewriter_class=rule.rewriters['js'],
|
||||||
css_rewriter_class=rule.rewriters['css'],
|
css_rewriter_class=rule.rewriters['css'],
|
||||||
@ -125,7 +134,10 @@ class RewriteContent:
|
|||||||
defmod=self.defmod)
|
defmod=self.defmod)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# apply one of (js, css, xml) rewriters
|
if wb_url.is_banner_only:
|
||||||
|
return (status_headers, self.stream_to_gen(stream), False)
|
||||||
|
|
||||||
|
# apply one of (js, css, xml) rewriters
|
||||||
rewriter = rewriter_class(urlrewriter)
|
rewriter = rewriter_class(urlrewriter)
|
||||||
|
|
||||||
# Create rewriting generator
|
# Create rewriting generator
|
||||||
@ -134,6 +146,32 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (status_headers, gen, True)
|
return (status_headers, gen, True)
|
||||||
|
|
||||||
|
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
|
||||||
|
|
||||||
|
def _head_insert_only_gen(self, insert_str, stream):
    """Yield the content of ``stream`` with ``insert_str`` added near the top.

    Up to 1024 characters are buffered from the start of ``stream`` and
    searched with ``self.HEAD_REGEX``.  When the pattern matches,
    ``insert_str`` is emitted immediately after the match; otherwise it is
    emitted before any content.  The buffered text itself is passed through
    unchanged, and whatever remains in ``stream`` is then delegated to
    ``self.stream_to_gen(stream)``.

    :param insert_str: text to insert into the output
    :param stream: object exposing ``read(size)``
    :return: generator of string chunks
    """
    max_len = 1024
    buff = ''
    while max_len > 0:
        curr = stream.read(max_len)
        if not curr:
            break

        # Charge the chunk that was just read against the remaining budget.
        # (Subtracting len(buff) before appending left the first read
        # uncounted, so roughly twice the intended amount was buffered.)
        max_len -= len(curr)
        buff += curr

    matcher = self.HEAD_REGEX.search(buff)

    if matcher:
        # split the buffer right after the matched opening tag
        split_at = matcher.end()
        yield buff[:split_at] + insert_str
        yield buff[split_at:]
    else:
        # nothing matched within the buffered prefix: insert goes first
        yield insert_str
        yield buff

    # pass the rest of the stream through untouched
    for rest in self.stream_to_gen(stream):
        yield rest
|
||||||
|
|
||||||
|
|
||||||
# Create rewrite stream, may even be chunked by front-end
|
# Create rewrite stream, may even be chunked by front-end
|
||||||
def _rewriting_stream_gen(self, rewriter, encoding,
|
def _rewriting_stream_gen(self, rewriter, encoding,
|
||||||
stream, first_buff=None):
|
stream, first_buff=None):
|
||||||
|
@ -14,8 +14,9 @@ from pywb.utils.timeutils import datetime_to_timestamp
|
|||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from url_rewriter import UrlRewriter
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from wburl import WbUrl
|
||||||
|
from rewrite_content import RewriteContent
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -114,15 +115,20 @@ class LiveRewriter(object):
|
|||||||
|
|
||||||
return (status_headers, stream)
|
return (status_headers, stream)
|
||||||
|
|
||||||
def fetch_request(self, url, urlrewriter,
|
def fetch_request(self, wb_url, urlrewriter,
|
||||||
head_insert_func=None,
|
head_insert_func=None,
|
||||||
urlkey=None,
|
urlkey=None,
|
||||||
env=None,
|
env=None,
|
||||||
req_headers={},
|
req_headers={},
|
||||||
timestamp=None,
|
timestamp=None,
|
||||||
follow_redirects=False,
|
follow_redirects=False,
|
||||||
proxies=None,
|
proxies=None):
|
||||||
mod=None):
|
|
||||||
|
if isinstance(wb_url, str):
|
||||||
|
url = wb_url
|
||||||
|
wb_url = WbUrl(url)
|
||||||
|
else:
|
||||||
|
url = wb_url.url
|
||||||
|
|
||||||
ts_err = url.split('///')
|
ts_err = url.split('///')
|
||||||
|
|
||||||
@ -155,13 +161,13 @@ class LiveRewriter(object):
|
|||||||
}
|
}
|
||||||
|
|
||||||
result = (self.rewriter.
|
result = (self.rewriter.
|
||||||
rewrite_content(urlrewriter,
|
rewrite_content(wb_url,
|
||||||
|
urlrewriter,
|
||||||
status_headers,
|
status_headers,
|
||||||
stream,
|
stream,
|
||||||
head_insert_func=head_insert_func,
|
head_insert_func=head_insert_func,
|
||||||
urlkey=urlkey,
|
urlkey=urlkey,
|
||||||
cdx=cdx,
|
cdx=cdx))
|
||||||
mod=mod))
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@ -174,41 +180,3 @@ class LiveRewriter(object):
|
|||||||
buff = ''.join(gen)
|
buff = ''.join(gen)
|
||||||
|
|
||||||
return (status_headers, buff)
|
return (status_headers, buff)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def main():  # pragma: no cover
    """Fetch a url through LiveRewriter and write the rewritten body to stdout.

    Reads ``sys.argv``: the first argument is the url to fetch, the optional
    second is a ``[/]prefix/wburl`` target used to build the UrlRewriter.
    The usage text also mentions ``[extra-prefix]``, but no third argument
    is read here.

    Returns 1 (after printing usage) when no url is given, otherwise 0.
    """
    import sys

    args = sys.argv

    # guard: a url to fetch is mandatory
    if len(args) < 2:
        msg = 'Usage: {0} url-to-fetch [wb-url-target] [extra-prefix]'
        print(msg.format(args[0]))
        return 1

    url = args[1]

    if len(args) < 3:
        # no target supplied: use the sample url stamped with the current time
        prefix = '/pywb_rewrite/'
        wburl_str = (datetime_to_timestamp(datetime.datetime.now()) +
                     '/http://example.com/path/sample.html')
    else:
        wburl_str = args[2]
        if wburl_str.startswith('/'):
            wburl_str = wburl_str[1:]

        # first path segment is the prefix, the remainder is the wburl
        prefix, wburl_str = wburl_str.split('/', 1)
        prefix = '/' + prefix + '/'

    urlrewriter = UrlRewriter(wburl_str, prefix)

    status_headers, buff = LiveRewriter().get_rewritten(url, urlrewriter)

    sys.stdout.write(buff)
    return 0


#=================================================================
if __name__ == "__main__":
    exit(main())
|
|
||||||
|
@ -196,8 +196,11 @@ class WbUrl(BaseWbUrl):
|
|||||||
@property
|
@property
|
||||||
def is_embed(self):
|
def is_embed(self):
|
||||||
return (self.mod and
|
return (self.mod and
|
||||||
self.mod != 'id_' and
|
self.mod not in ('id_', 'mp_', 'bn_'))
|
||||||
self.mod != 'mp_')
|
|
||||||
|
@property
def is_banner_only(self):
    """True when this url's modifier is exactly 'bn_'."""
    banner_mod = 'bn_'
    return self.mod == banner_mod
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_identity(self):
|
def is_identity(self):
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
<!-- WB Insert -->
|
<!-- WB Insert -->
|
||||||
{% if rule.js_rewrite_location %}
|
{% if rule.js_rewrite_location and include_wombat %}
|
||||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
|
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
|
||||||
<script>
|
<script>
|
||||||
{% set urlsplit = cdx['original'] | urlsplit %}
|
{% set urlsplit = cdx['original'] | urlsplit %}
|
||||||
@ -16,9 +16,10 @@
|
|||||||
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
|
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
|
||||||
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
||||||
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
|
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
|
||||||
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}}
|
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
|
||||||
wbinfo.canon_url = "{{ canon_url }}";
|
wbinfo.canon_url = "{{ canon_url }}";
|
||||||
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
|
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
|
||||||
|
wbinfo.is_proxy_mode = {{ "true" if wbrequest.options.is_proxy else "false" }};
|
||||||
</script>
|
</script>
|
||||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
|
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
|
||||||
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
|
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>
|
||||||
|
@ -115,6 +115,14 @@ class StaticHandler(BaseHandler):
|
|||||||
try:
|
try:
|
||||||
data = self.block_loader.load(full_path)
|
data = self.block_loader.load(full_path)
|
||||||
|
|
||||||
|
try:
|
||||||
|
data.seek(0, 2)
|
||||||
|
size = data.tell()
|
||||||
|
data.seek(0)
|
||||||
|
headers = [('Content-Length', str(size))]
|
||||||
|
except IOError:
|
||||||
|
headers = None
|
||||||
|
|
||||||
if 'wsgi.file_wrapper' in wbrequest.env:
|
if 'wsgi.file_wrapper' in wbrequest.env:
|
||||||
reader = wbrequest.env['wsgi.file_wrapper'](data)
|
reader = wbrequest.env['wsgi.file_wrapper'](data)
|
||||||
else:
|
else:
|
||||||
@ -122,7 +130,9 @@ class StaticHandler(BaseHandler):
|
|||||||
|
|
||||||
content_type, _ = mimetypes.guess_type(full_path)
|
content_type, _ = mimetypes.guess_type(full_path)
|
||||||
|
|
||||||
return WbResponse.text_stream(data, content_type=content_type)
|
return WbResponse.text_stream(data,
|
||||||
|
content_type=content_type,
|
||||||
|
headers=headers)
|
||||||
|
|
||||||
except IOError:
|
except IOError:
|
||||||
raise NotFoundException('Static File Not Found: ' +
|
raise NotFoundException('Static File Not Found: ' +
|
||||||
|
@ -99,8 +99,8 @@ class RewriteLiveView(BaseContentView):
|
|||||||
if ref_wburl_str:
|
if ref_wburl_str:
|
||||||
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
|
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
|
||||||
|
|
||||||
url = wbrequest.wb_url.url
|
wb_url = wbrequest.wb_url
|
||||||
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
|
result = self.rewriter.fetch_request(wb_url, wbrequest.urlrewriter,
|
||||||
head_insert_func=head_insert_func,
|
head_insert_func=head_insert_func,
|
||||||
env=wbrequest.env)
|
env=wbrequest.env)
|
||||||
|
|
||||||
@ -211,14 +211,13 @@ class ReplayView(BaseContentView):
|
|||||||
create_insert_func(wbrequest))
|
create_insert_func(wbrequest))
|
||||||
|
|
||||||
result = (self.content_rewriter.
|
result = (self.content_rewriter.
|
||||||
rewrite_content(urlrewriter,
|
rewrite_content(wbrequest.wb_url,
|
||||||
|
urlrewriter,
|
||||||
headers=status_headers,
|
headers=status_headers,
|
||||||
stream=stream,
|
stream=stream,
|
||||||
head_insert_func=head_insert_func,
|
head_insert_func=head_insert_func,
|
||||||
urlkey=cdx['urlkey'],
|
urlkey=cdx['urlkey'],
|
||||||
sanitize_only=wbrequest.wb_url.is_identity,
|
cdx=cdx))
|
||||||
cdx=cdx,
|
|
||||||
mod=wbrequest.wb_url.mod))
|
|
||||||
|
|
||||||
(status_headers, response_iter, is_rewritten) = result
|
(status_headers, response_iter, is_rewritten) = result
|
||||||
|
|
||||||
|
@ -121,16 +121,18 @@ def add_env_globals(glb):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class HeadInsertView(J2TemplateView):
|
class HeadInsertView(J2TemplateView):
|
||||||
def create_insert_func(self, wbrequest, include_ts=True):
|
def create_insert_func(self, wbrequest,
|
||||||
|
include_ts=True):
|
||||||
|
|
||||||
canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='')
|
canon_url = wbrequest.wb_prefix + wbrequest.wb_url.to_str(mod='')
|
||||||
include_ts = include_ts
|
include_wombat = not wbrequest.wb_url.is_banner_only
|
||||||
|
|
||||||
def make_head_insert(rule, cdx):
|
def make_head_insert(rule, cdx):
|
||||||
return (self.render_to_string(wbrequest=wbrequest,
|
return (self.render_to_string(wbrequest=wbrequest,
|
||||||
cdx=cdx,
|
cdx=cdx,
|
||||||
canon_url=canon_url,
|
canon_url=canon_url,
|
||||||
include_ts=include_ts,
|
include_ts=include_ts,
|
||||||
|
include_wombat=include_wombat,
|
||||||
rule=rule))
|
rule=rule))
|
||||||
return make_head_insert
|
return make_head_insert
|
||||||
|
|
||||||
|
@ -98,6 +98,7 @@ class TestWb:
|
|||||||
|
|
||||||
assert 'Mon, Jan 27 2014 17:12:38' in resp.body
|
assert 'Mon, Jan 27 2014 17:12:38' in resp.body
|
||||||
assert 'wb.js' in resp.body
|
assert 'wb.js' in resp.body
|
||||||
|
assert 'WB_wombat_init' in resp.body
|
||||||
assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body
|
assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body
|
||||||
|
|
||||||
def test_replay_non_frame_content(self):
|
def test_replay_non_frame_content(self):
|
||||||
@ -141,6 +142,19 @@ class TestWb:
|
|||||||
assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')
|
assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')
|
||||||
|
|
||||||
|
|
||||||
|
def test_replay_banner_only(self):
    """Replay with the 'bn_' modifier: wb.js is present, wombat is not,
    and a relative stylesheet url appears unrewritten in the body."""
    body = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved').body

    # wb.js header insertion
    assert 'wb.js' in body

    # no wombat present
    assert 'WB_wombat_init' not in body

    # url not rewritten
    #assert '"http://www.iana.org/domains/example"' in resp.body
    assert '"/_css/2013.1/screen.css"' in body
|
||||||
|
|
||||||
def test_replay_identity_1(self):
|
def test_replay_identity_1(self):
|
||||||
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
|
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user