mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
html rewrite: add trailing slash for <base> tag rewrite if url is a scheme://host
with no path component (#77). Cleanup: remove unused code path for tags with no rewriting -- all tags are now checked for dynamic attrs which may need rewriting. Update tests, including the live rewrite test that depends on a live site (FB).
This commit is contained in:
parent
1fb631870b
commit
24021fcd57
@ -5,6 +5,7 @@ import sys
|
||||
import re
|
||||
|
||||
from HTMLParser import HTMLParser, HTMLParseError
|
||||
from urlparse import urlsplit
|
||||
|
||||
from url_rewriter import UrlRewriter
|
||||
from regex_rewriters import JSRewriter, CSSRewriter
|
||||
@ -121,7 +122,22 @@ class HTMLRewriterMixin(object):
|
||||
meta_refresh[m.end(1):])
|
||||
|
||||
return meta_refresh
|
||||
# ===========================
|
||||
|
||||
def _rewrite_base(self, value, mod=''):
    """Rewrite the href of a <base> tag and rebase the url rewriter on it.

    :param value: raw href value taken from the <base> tag
    :param mod: rewrite modifier, passed through to ``_rewrite_url``
    :return: the rewritten url when the ``rewrite_base`` option is enabled
        (the default); otherwise the href itself, with a ``/`` path added
        if it had none
    """
    # A url with no path component, eg. http://example.com, is given a '/'
    # path. The slash has to go in front of any query or fragment: simply
    # appending it would turn http://example.com?a=b into
    # http://example.com?a=b/ and corrupt the query.
    if not urlsplit(value).path:
        marks = [value.find(ch) for ch in '?#']
        marks = [pos for pos in marks if pos >= 0]
        cut = min(marks) if marks else len(value)
        value = value[:cut] + '/' + value[cut:]

    base_value = self._rewrite_url(value, mod)

    if self.opts.get('rewrite_base', True):
        value = base_value

    # the rewriter is rebased on the rewritten url even when the tag
    # itself is written out unrewritten
    self.url_rewriter = (self.url_rewriter.
                         rebase_rewriter(base_value))
    return value
|
||||
|
||||
def _rewrite_url(self, value, mod=None):
|
||||
if value:
|
||||
@ -221,12 +237,7 @@ class HTMLRewriterMixin(object):
|
||||
# special case: base tag
|
||||
elif (tag == 'base') and (attr_name == 'href') and attr_value:
|
||||
rw_mod = handler.get(attr_name)
|
||||
base_value = self._rewrite_url(attr_value, rw_mod)
|
||||
if self.opts.get('rewrite_base', True):
|
||||
attr_value = base_value
|
||||
self.url_rewriter = (self.url_rewriter.
|
||||
rebase_rewriter(base_value))
|
||||
|
||||
attr_value = self._rewrite_base(attr_value, rw_mod)
|
||||
else:
|
||||
# rewrite url using tag handler
|
||||
rw_mod = handler.get(attr_name)
|
||||
@ -338,15 +349,15 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
return s
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if not self._rewrite_tag_attrs(tag, attrs):
|
||||
self.out.write(self.get_starttag_text())
|
||||
elif tag != 'head' or not self._rewrite_head(False):
|
||||
self._rewrite_tag_attrs(tag, attrs)
|
||||
|
||||
if tag != 'head' or not self._rewrite_head(False):
|
||||
self.out.write('>')
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
if not self._rewrite_tag_attrs(tag, attrs):
|
||||
self.out.write(self.get_starttag_text())
|
||||
elif tag != 'head' or not self._rewrite_head(True):
|
||||
self._rewrite_tag_attrs(tag, attrs)
|
||||
|
||||
if tag != 'head' or not self._rewrite_head(True):
|
||||
self.out.write('/>')
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
|
@ -27,6 +27,10 @@ ur"""
|
||||
>>> parse('<base href="static/"/><img src="image.gif"/>')
|
||||
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
|
||||
# ensure trailing slash added
|
||||
>>> parse('<base href="http://example.com"/>')
|
||||
<base href="/web/20131226101010/http://example.com/"/>
|
||||
|
||||
# Base Tests -- no rewrite
|
||||
>>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>', urlrewriter=no_base_canon_rewriter)
|
||||
<html><head><base href="http://example.com/diff/path/file.html"/>
|
||||
|
@ -203,8 +203,9 @@ def test_example_4_rewrite_err():
|
||||
def test_example_domain_specific_3():
|
||||
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter, follow_redirects=True)
|
||||
|
||||
# comment out bootloader
|
||||
assert '/* Bootloader.configurePage' in buff
|
||||
# comment out Bootloader.configurePage, if it is still there
|
||||
if 'Bootloader.configurePage' in buff:
|
||||
assert '/* Bootloader.configurePage' in buff
|
||||
|
||||
def test_wombat_top():
|
||||
#status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter)
|
||||
|
Loading…
x
Reference in New Issue
Block a user