From 83a33e05410acc9b96fc73356f74b328944efca3 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Thu, 10 Dec 2015 08:23:14 +0000 Subject: [PATCH] Resolve relative canonical paths if rewriting is disabled For Via, we want rel=canonical links to resolve to the same absolute URL as it did on the original page. For absolute URLs, no rewriting is necessary. If the original rel=canonical URL was relative however, it needs to be resolved relative to the original URL. See https://github.com/hypothesis/via/issues/65 for context. --- pywb/rewrite/html_rewriter.py | 17 ++++++++++--- pywb/rewrite/test/test_html_rewriter.py | 34 +++++++++++++------------ 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 7c9020be..50629514 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -5,7 +5,7 @@ import sys import re from HTMLParser import HTMLParser, HTMLParseError -from urlparse import urlsplit, urlunsplit +from urlparse import urljoin, urlsplit, urlunsplit from url_rewriter import UrlRewriter from regex_rewriters import JSRewriter, CSSRewriter @@ -276,9 +276,18 @@ class HTMLRewriterMixin(object): # special case: if rewrite_canon not set, # don't rewrite rel=canonical elif tag == 'link' and attr_name == 'href': - if (self.opts.get('rewrite_rel_canon', True) or - not self.has_attr(tag_attrs, ('rel', 'canonical'))): - rw_mod = handler.get(attr_name) + rw_mod = handler.get(attr_name) + + if self.has_attr(tag_attrs, ('rel', 'canonical')): + if self.opts.get('rewrite_rel_canon', True): + attr_value = self._rewrite_url(attr_value, rw_mod) + else: + # resolve relative rel=canonical URLs so that they + # refer to the same absolute URL as on the original + # page (see https://github.com/hypothesis/via/issues/65 + # for context) + attr_value = urljoin(self.orig_url, attr_value) + else: attr_value = self._rewrite_url(attr_value, rw_mod) # special case: meta tag diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 820124f0..0ceface3 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -158,8 +158,12 @@ ur""" # rel=canonical: no_rewrite ->>> parse('', urlrewriter=no_base_canon_rewriter) - +>>> parse('', urlrewriter=no_base_canon_rewriter) + + +# rel=canonical: no_rewrite +>>> parse('', urlrewriter=no_base_canon_rewriter) + # doctype >>> parse('') @@ -210,26 +214,24 @@ from pywb.rewrite.html_rewriter import HTMLRewriter import pprint import urllib -urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', - '/web/', - rewrite_opts=dict(punycode_links=False)) +ORIGINAL_URL = 'http://example.com/some/path/index.html' -full_path_urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', - 'http://localhost:80/web/', - rewrite_opts=dict(punycode_links=False)) +def new_rewriter(prefix='/web/', rewrite_opts=dict()): + PROXY_PATH = '20131226101010/{0}'.format(ORIGINAL_URL) + return UrlRewriter(PROXY_PATH, prefix, rewrite_opts=rewrite_opts) -urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html', - '/web/', - rewrite_opts=dict(punycode_links=True)) +urlrewriter = new_rewriter(rewrite_opts=dict(punycode_links=False)) +full_path_urlrewriter = new_rewriter(prefix='http://localhost:80/web/', + rewrite_opts=dict(punycode_links=False)) -no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', - '/web/', - rewrite_opts=dict(rewrite_rel_canon=False, - rewrite_base=False)) +urlrewriter_pencode = new_rewriter(rewrite_opts=dict(punycode_links=True)) + +no_base_canon_rewriter = new_rewriter(rewrite_opts=dict(rewrite_rel_canon=False, + rewrite_base=False)) def parse(data, head_insert=None, urlrewriter=urlrewriter): - parser = HTMLRewriter(urlrewriter, head_insert = head_insert) + parser = HTMLRewriter(urlrewriter, head_insert = head_insert, url = ORIGINAL_URL) if isinstance(data, unicode): data = data.encode('utf-8')