1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge pull request #162 from robertknight/resolve_relative_rel_canon

rewrite: Resolve relative canonical paths if rewriting is disabled
This commit is contained in:
Ilya Kreymer 2015-12-10 11:33:15 -08:00
commit b12a24e990
2 changed files with 31 additions and 20 deletions

View File

@ -5,7 +5,7 @@ import sys
import re import re
from HTMLParser import HTMLParser, HTMLParseError from HTMLParser import HTMLParser, HTMLParseError
from urlparse import urlsplit, urlunsplit from urlparse import urljoin, urlsplit, urlunsplit
from url_rewriter import UrlRewriter from url_rewriter import UrlRewriter
from regex_rewriters import JSRewriter, CSSRewriter from regex_rewriters import JSRewriter, CSSRewriter
@ -276,9 +276,18 @@ class HTMLRewriterMixin(object):
# special case: if rewrite_canon not set, # special case: if rewrite_canon not set,
# don't rewrite rel=canonical # don't rewrite rel=canonical
elif tag == 'link' and attr_name == 'href': elif tag == 'link' and attr_name == 'href':
if (self.opts.get('rewrite_rel_canon', True) or rw_mod = handler.get(attr_name)
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
rw_mod = handler.get(attr_name) if self.has_attr(tag_attrs, ('rel', 'canonical')):
if self.opts.get('rewrite_rel_canon', True):
attr_value = self._rewrite_url(attr_value, rw_mod)
else:
# resolve relative rel=canonical URLs so that they
# refer to the same absolute URL as on the original
# page (see https://github.com/hypothesis/via/issues/65
# for context)
attr_value = urljoin(self.orig_url, attr_value)
else:
attr_value = self._rewrite_url(attr_value, rw_mod) attr_value = self._rewrite_url(attr_value, rw_mod)
# special case: meta tag # special case: meta tag

View File

@ -158,8 +158,12 @@ ur"""
<link rel="canonical" href="/web/20131226101010oe_/http://example.com/"> <link rel="canonical" href="/web/20131226101010oe_/http://example.com/">
# rel=canonical: no_rewrite # rel=canonical: no_rewrite
>>> parse('<link rel=canonical href="http://example.com/">', urlrewriter=no_base_canon_rewriter) >>> parse('<link rel=canonical href="http://example.com/canon/path">', urlrewriter=no_base_canon_rewriter)
<link rel="canonical" href="http://example.com/"> <link rel="canonical" href="http://example.com/canon/path">
# rel=canonical: no_rewrite
>>> parse('<link rel=canonical href="/relative/path">', urlrewriter=no_base_canon_rewriter)
<link rel="canonical" href="http://example.com/relative/path">
# doctype # doctype
>>> parse('<!doctype html PUBLIC "public">') >>> parse('<!doctype html PUBLIC "public">')
@ -210,26 +214,24 @@ from pywb.rewrite.html_rewriter import HTMLRewriter
import pprint import pprint
import urllib import urllib
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', ORIGINAL_URL = 'http://example.com/some/path/index.html'
'/web/',
rewrite_opts=dict(punycode_links=False))
full_path_urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', def new_rewriter(prefix='/web/', rewrite_opts=dict()):
'http://localhost:80/web/', PROXY_PATH = '20131226101010/{0}'.format(ORIGINAL_URL)
rewrite_opts=dict(punycode_links=False)) return UrlRewriter(PROXY_PATH, prefix, rewrite_opts=rewrite_opts)
urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html', urlrewriter = new_rewriter(rewrite_opts=dict(punycode_links=False))
'/web/',
rewrite_opts=dict(punycode_links=True))
full_path_urlrewriter = new_rewriter(prefix='http://localhost:80/web/',
rewrite_opts=dict(punycode_links=False))
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', urlrewriter_pencode = new_rewriter(rewrite_opts=dict(punycode_links=True))
'/web/',
rewrite_opts=dict(rewrite_rel_canon=False, no_base_canon_rewriter = new_rewriter(rewrite_opts=dict(rewrite_rel_canon=False,
rewrite_base=False)) rewrite_base=False))
def parse(data, head_insert=None, urlrewriter=urlrewriter): def parse(data, head_insert=None, urlrewriter=urlrewriter):
parser = HTMLRewriter(urlrewriter, head_insert = head_insert) parser = HTMLRewriter(urlrewriter, head_insert = head_insert, url = ORIGINAL_URL)
if isinstance(data, unicode): if isinstance(data, unicode):
data = data.encode('utf-8') data = data.encode('utf-8')