mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge branch 'develop' 0.6.6 into video
This commit is contained in:
commit
7e36ad29e7
31
CHANGES.rst
31
CHANGES.rst
@ -1,7 +1,36 @@
|
|||||||
pywb 0.7.0 changelist
|
pywb 0.7.0 changelist
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
Video Buffering Replay
|
Video/streaming content replay and buffering improvements!
|
||||||
|
|
||||||
|
|
||||||
|
pywb 0.6.6 changelist
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
* Beginning of new rewrite options, settable per collection and stored in UrlRewriter. Available options:
|
||||||
|
|
||||||
|
- `rewrite_base` - set to False to disable rewriting `<base href="...">` tag
|
||||||
|
- `rewrite_rel_canon` - set to False to disable rewriting `<link rel=canonical href="...">`
|
||||||
|
|
||||||
|
* JS rewrite: Don't rewrite location if starting with '$'
|
||||||
|
|
||||||
|
|
||||||
|
pywb 0.6.5 changelist
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
* fix static handling when content type can not be guessed, default to 'application/octet-stream'
|
||||||
|
|
||||||
|
* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly
|
||||||
|
|
||||||
|
* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as \\example.com
|
||||||
|
|
||||||
|
* cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root
|
||||||
|
|
||||||
|
* don't rewrite rel=canonical links for services which rely on these
|
||||||
|
|
||||||
|
* cdx-indexer: Detect non-gzip chunk-encoded .warc.gz/.arc.gz archive files and show a meaningful
|
||||||
|
error message explaining how to fix the issue (uncompress and possibly use warctools warc2warc to recompress)
|
||||||
|
>>>>>>> develop
|
||||||
|
|
||||||
|
|
||||||
pywb 0.6.4 changelist
|
pywb 0.6.4 changelist
|
||||||
|
@ -58,10 +58,10 @@ running ``python setup.py install``:
|
|||||||
for all options.
|
for all options.
|
||||||
|
|
||||||
|
|
||||||
* ``cdx-server`` -- a CDX API only server which returns responses about CDX captures in bulk.
|
* ``cdx-server`` -- a CDX API only server which returns responses about CDX captures in bulk.
|
||||||
Includes most of the features of the `original cdx server implementation <https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server>`_,
|
Includes most of the features of the `original cdx server implementation <https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server>`_,
|
||||||
updated documentation coming soon.
|
updated documentation coming soon.
|
||||||
|
|
||||||
* ``proxy-cert-auth`` -- a utility to support proxy mode. It can be used to generate a CA root certificate, or a per-host certificate with an existing root cert.
|
* ``proxy-cert-auth`` -- a utility to support proxy mode. It can be used to generate a CA root certificate, or a per-host certificate with an existing root cert.
|
||||||
|
|
||||||
|
|
||||||
|
@ -62,7 +62,8 @@ class ArchivalRouter(object):
|
|||||||
use_abs_prefix=use_abs_prefix,
|
use_abs_prefix=use_abs_prefix,
|
||||||
wburl_class=route.handler.get_wburl_type(),
|
wburl_class=route.handler.get_wburl_type(),
|
||||||
urlrewriter_class=UrlRewriter,
|
urlrewriter_class=UrlRewriter,
|
||||||
cookie_scope=route.cookie_scope)
|
cookie_scope=route.cookie_scope,
|
||||||
|
rewrite_opts=route.rewrite_opts)
|
||||||
|
|
||||||
# Allow for applying of additional filters
|
# Allow for applying of additional filters
|
||||||
route.apply_filters(wbrequest, matcher)
|
route.apply_filters(wbrequest, matcher)
|
||||||
@ -101,6 +102,7 @@ class Route(object):
|
|||||||
# collection id from regex group (default 0)
|
# collection id from regex group (default 0)
|
||||||
self.coll_group = coll_group
|
self.coll_group = coll_group
|
||||||
self.cookie_scope = config.get('cookie_scope')
|
self.cookie_scope = config.get('cookie_scope')
|
||||||
|
self.rewrite_opts = config.get('rewrite_opts', {})
|
||||||
self._custom_init(config)
|
self._custom_init(config)
|
||||||
|
|
||||||
def is_handling(self, request_uri):
|
def is_handling(self, request_uri):
|
||||||
|
@ -38,7 +38,8 @@ class WbRequest(object):
|
|||||||
wburl_class=None,
|
wburl_class=None,
|
||||||
urlrewriter_class=None,
|
urlrewriter_class=None,
|
||||||
is_proxy=False,
|
is_proxy=False,
|
||||||
cookie_scope=None):
|
cookie_scope=None,
|
||||||
|
rewrite_opts={}):
|
||||||
|
|
||||||
self.env = env
|
self.env = env
|
||||||
|
|
||||||
@ -77,7 +78,8 @@ class WbRequest(object):
|
|||||||
host_prefix + rel_prefix,
|
host_prefix + rel_prefix,
|
||||||
rel_prefix,
|
rel_prefix,
|
||||||
env.get('SCRIPT_NAME', '/'),
|
env.get('SCRIPT_NAME', '/'),
|
||||||
cookie_scope)
|
cookie_scope,
|
||||||
|
rewrite_opts)
|
||||||
|
|
||||||
self.urlrewriter.deprefix_url()
|
self.urlrewriter.deprefix_url()
|
||||||
else:
|
else:
|
||||||
|
@ -94,6 +94,9 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
self.rewrite_tags = self._init_rewrite_tags(defmod)
|
self.rewrite_tags = self._init_rewrite_tags(defmod)
|
||||||
|
|
||||||
|
# get opts from urlrewriter
|
||||||
|
self.opts = url_rewriter.rewrite_opts
|
||||||
|
|
||||||
# ===========================
|
# ===========================
|
||||||
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
|
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
|
||||||
re.IGNORECASE | re.MULTILINE)
|
re.IGNORECASE | re.MULTILINE)
|
||||||
@ -186,9 +189,11 @@ class HTMLRewriterMixin(object):
|
|||||||
elif attr_name == 'crossorigin':
|
elif attr_name == 'crossorigin':
|
||||||
attr_name = '_crossorigin'
|
attr_name = '_crossorigin'
|
||||||
|
|
||||||
# special case: link don't rewrite canonical
|
# special case: if rewrite_rel_canon is disabled,
|
||||||
|
# don't rewrite rel=canonical
|
||||||
elif tag == 'link' and attr_name == 'href':
|
elif tag == 'link' and attr_name == 'href':
|
||||||
if not self.has_attr(tag_attrs, ('rel', 'canonical')):
|
if (self.opts.get('rewrite_rel_canon', True) or
|
||||||
|
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
|
||||||
rw_mod = handler.get(attr_name)
|
rw_mod = handler.get(attr_name)
|
||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
|
||||||
@ -209,17 +214,21 @@ class HTMLRewriterMixin(object):
|
|||||||
rw_mod = 'oe_'
|
rw_mod = 'oe_'
|
||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
|
||||||
|
# special case: base tag
|
||||||
|
elif (tag == 'base') and (attr_name == 'href') and attr_value:
|
||||||
|
rw_mod = handler.get(attr_name)
|
||||||
|
base_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
if self.opts.get('rewrite_base', True):
|
||||||
|
attr_value = base_value
|
||||||
|
self.url_rewriter = (self.url_rewriter.
|
||||||
|
rebase_rewriter(base_value))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# rewrite url using tag handler
|
# rewrite url using tag handler
|
||||||
rw_mod = handler.get(attr_name)
|
rw_mod = handler.get(attr_name)
|
||||||
if rw_mod is not None:
|
if rw_mod is not None:
|
||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
|
||||||
# special case: base tag
|
|
||||||
if (tag == 'base') and (attr_name == 'href') and attr_value:
|
|
||||||
self.url_rewriter = (self.url_rewriter.
|
|
||||||
rebase_rewriter(attr_value))
|
|
||||||
|
|
||||||
# write the attr!
|
# write the attr!
|
||||||
self._write_attr(attr_name, attr_value)
|
self._write_attr(attr_name, attr_value)
|
||||||
|
|
||||||
|
@ -130,7 +130,7 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
|
|||||||
|
|
||||||
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
|
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
|
||||||
rules = rules + [
|
rules = rules + [
|
||||||
(r'(?<!/)\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<![/$])\blocation\b(?!\":)', RegexRewriter.add_prefix(prefix), 0),
|
||||||
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
||||||
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
||||||
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
|
(r'(?<=document\.)cookie', RegexRewriter.add_prefix(prefix), 0),
|
||||||
|
@ -20,13 +20,22 @@ ur"""
|
|||||||
#>>> parse('<input "selected"><img src></div>')
|
#>>> parse('<input "selected"><img src></div>')
|
||||||
#<input "selected"=""><img src=""></div>
|
#<input "selected"=""><img src=""></div>
|
||||||
|
|
||||||
# Base Tests
|
# Base Tests -- w/ rewrite (default)
|
||||||
>>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>')
|
>>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>')
|
||||||
<html><head><base href="/web/20131226101010/http://example.com/diff/path/file.html"/>
|
<html><head><base href="/web/20131226101010/http://example.com/diff/path/file.html"/>
|
||||||
|
|
||||||
>>> parse('<base href="static/"/><img src="image.gif"/>')
|
>>> parse('<base href="static/"/><img src="image.gif"/>')
|
||||||
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||||
|
|
||||||
|
# Base Tests -- no rewrite
|
||||||
|
>>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>', urlrewriter=no_base_canon_rewriter)
|
||||||
|
<html><head><base href="http://example.com/diff/path/file.html"/>
|
||||||
|
|
||||||
|
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
|
||||||
|
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# HTML Entities
|
# HTML Entities
|
||||||
>>> parse('<a href="">› > ?</div>')
|
>>> parse('<a href="">› > ?</div>')
|
||||||
<a href="">› > ?</div>
|
<a href="">› > ?</div>
|
||||||
@ -106,8 +115,12 @@ ur"""
|
|||||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||||
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
||||||
|
|
||||||
# don't rewrite rel=canonical
|
# rel=canonical: rewrite (default)
|
||||||
>>> parse('<link rel=canonical href="http://example.com/">')
|
>>> parse('<link rel=canonical href="http://example.com/">')
|
||||||
|
<link rel="canonical" href="/web/20131226101010oe_/http://example.com/">
|
||||||
|
|
||||||
|
# rel=canonical: no_rewrite
|
||||||
|
>>> parse('<link rel=canonical href="http://example.com/">', urlrewriter=no_base_canon_rewriter)
|
||||||
<link rel="canonical" href="http://example.com/">
|
<link rel="canonical" href="http://example.com/">
|
||||||
|
|
||||||
# doctype
|
# doctype
|
||||||
@ -147,7 +160,12 @@ import pprint
|
|||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
|
||||||
|
|
||||||
def parse(data, head_insert = None):
|
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||||
|
'/web/',
|
||||||
|
rewrite_opts=dict(rewrite_rel_canon=False,
|
||||||
|
rewrite_base=False))
|
||||||
|
|
||||||
|
def parse(data, head_insert=None, urlrewriter=urlrewriter):
|
||||||
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
||||||
#data = data.decode('utf-8')
|
#data = data.decode('utf-8')
|
||||||
result = parser.rewrite(data) + parser.close()
|
result = parser.rewrite(data) + parser.close()
|
||||||
|
@ -20,13 +20,14 @@ class UrlRewriter(object):
|
|||||||
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
||||||
|
|
||||||
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
|
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
|
||||||
root_path=None, cookie_scope=None):
|
root_path=None, cookie_scope=None, rewrite_opts={}):
|
||||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
self.full_prefix = full_prefix
|
self.full_prefix = full_prefix
|
||||||
self.rel_prefix = rel_prefix if rel_prefix else prefix
|
self.rel_prefix = rel_prefix if rel_prefix else prefix
|
||||||
self.root_path = root_path if root_path else '/'
|
self.root_path = root_path if root_path else '/'
|
||||||
self.cookie_scope = cookie_scope
|
self.cookie_scope = cookie_scope
|
||||||
|
self.rewrite_opts = rewrite_opts
|
||||||
|
|
||||||
def rewrite(self, url, mod=None):
|
def rewrite(self, url, mod=None):
|
||||||
# if special protocol, no rewriting at all
|
# if special protocol, no rewriting at all
|
||||||
|
@ -193,7 +193,11 @@ class StaticHandler(BaseHandler):
|
|||||||
else:
|
else:
|
||||||
reader = iter(lambda: data.read(), '')
|
reader = iter(lambda: data.read(), '')
|
||||||
|
|
||||||
content_type, _ = mimetypes.guess_type(full_path)
|
content_type = 'application/octet-stream'
|
||||||
|
|
||||||
|
guessed = mimetypes.guess_type(full_path)
|
||||||
|
if guessed[0]:
|
||||||
|
content_type = guessed[0]
|
||||||
|
|
||||||
return WbResponse.text_stream(reader,
|
return WbResponse.text_stream(reader,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user