mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rewrite: optimization / sanity check: only %-encode URLs that are actually IDNA-encoded,
otherwise return them as is (#66)
This commit is contained in:
parent
afe49a91f4
commit
c4d5dd4690
@ -53,7 +53,9 @@ ur"""
|
|||||||
|
|
||||||
# Unicode -- default with %-encoding
|
# Unicode -- default with %-encoding
|
||||||
>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
|
>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
|
||||||
<a href="/web/20131226101010/http://%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/">испытание</a>
|
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||||
|
|
||||||
|
#<a href="/web/20131226101010/http://%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/">испытание</a>
|
||||||
|
|
||||||
>>> parse(u'<a href="http://испытание.испытание/">испытание</a>', urlrewriter=urlrewriter_pencode)
|
>>> parse(u'<a href="http://испытание.испытание/">испытание</a>', urlrewriter=urlrewriter_pencode)
|
||||||
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||||
@ -171,6 +173,7 @@ from pywb.rewrite.url_rewriter import UrlRewriter
|
|||||||
from pywb.rewrite.html_rewriter import HTMLRewriter
|
from pywb.rewrite.html_rewriter import HTMLRewriter
|
||||||
|
|
||||||
import pprint
|
import pprint
|
||||||
|
import urllib
|
||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||||
'/web/',
|
'/web/',
|
||||||
@ -187,9 +190,12 @@ no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/pat
|
|||||||
rewrite_base=False))
|
rewrite_base=False))
|
||||||
|
|
||||||
def parse(data, head_insert=None, urlrewriter=urlrewriter):
|
def parse(data, head_insert=None, urlrewriter=urlrewriter):
|
||||||
data = data.encode('utf-8')
|
|
||||||
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
||||||
#data = data.decode('utf-8')
|
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
data = data.encode('utf-8')
|
||||||
|
#data = urllib.quote(data, ':" =/-\\<>')
|
||||||
|
|
||||||
result = parser.rewrite(data) + parser.close()
|
result = parser.rewrite(data) + parser.close()
|
||||||
# decode only for printing
|
# decode only for printing
|
||||||
print result.decode('utf-8')
|
print result.decode('utf-8')
|
||||||
|
@ -88,6 +88,9 @@ https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B
|
|||||||
>>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
|
>>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
|
||||||
http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||||
|
|
||||||
|
# invalid punycode -- url is returned unchanged
|
||||||
|
>>> print(to_uri_pencode('http://xn--abcd'))
|
||||||
|
http://xn--abcd
|
||||||
|
|
||||||
# IRI representation
|
# IRI representation
|
||||||
>>> repr(WbUrl(u'http://пример.испытание'))
|
>>> repr(WbUrl(u'http://пример.испытание'))
|
||||||
|
@ -97,6 +97,10 @@ class WbUrl(BaseWbUrl):
|
|||||||
The rest of url should be unchanged
|
The rest of url should be unchanged
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# only continue if the url is punycode-encoded (contains 'xn--')
|
||||||
|
if 'xn--' not in url:
|
||||||
|
return url
|
||||||
|
|
||||||
parts = urlparse.urlsplit(url)
|
parts = urlparse.urlsplit(url)
|
||||||
domain = parts.netloc
|
domain = parts.netloc
|
||||||
try:
|
try:
|
||||||
@ -108,10 +112,6 @@ class WbUrl(BaseWbUrl):
|
|||||||
|
|
||||||
domain = urllib.quote(domain)#, safe=r':\/')
|
domain = urllib.quote(domain)#, safe=r':\/')
|
||||||
|
|
||||||
# no changes
|
|
||||||
if parts.netloc == domain:
|
|
||||||
return url
|
|
||||||
|
|
||||||
return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))
|
return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user