mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Resolve relative canonical paths if rewriting is disabled
For Via, we want rel=canonical links to resolve to the same absolute URLs as they did on the original page. For absolute URLs, no rewriting is necessary. If the original rel=canonical URL was relative, however, it needs to be resolved relative to the original page's URL. See https://github.com/hypothesis/via/issues/65 for context.
This commit is contained in:
parent
7a0680fb35
commit
83a33e0541
@ -5,7 +5,7 @@ import sys
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
from HTMLParser import HTMLParser, HTMLParseError
|
from HTMLParser import HTMLParser, HTMLParseError
|
||||||
from urlparse import urlsplit, urlunsplit
|
from urlparse import urljoin, urlsplit, urlunsplit
|
||||||
|
|
||||||
from url_rewriter import UrlRewriter
|
from url_rewriter import UrlRewriter
|
||||||
from regex_rewriters import JSRewriter, CSSRewriter
|
from regex_rewriters import JSRewriter, CSSRewriter
|
||||||
@ -276,9 +276,18 @@ class HTMLRewriterMixin(object):
|
|||||||
# special case: if rewrite_canon not set,
|
# special case: if rewrite_canon not set,
|
||||||
# don't rewrite rel=canonical
|
# don't rewrite rel=canonical
|
||||||
elif tag == 'link' and attr_name == 'href':
|
elif tag == 'link' and attr_name == 'href':
|
||||||
if (self.opts.get('rewrite_rel_canon', True) or
|
rw_mod = handler.get(attr_name)
|
||||||
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
|
|
||||||
rw_mod = handler.get(attr_name)
|
if self.has_attr(tag_attrs, ('rel', 'canonical')):
|
||||||
|
if self.opts.get('rewrite_rel_canon', True):
|
||||||
|
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
else:
|
||||||
|
# resolve relative rel=canonical URLs so that they
|
||||||
|
# refer to the same absolute URL as on the original
|
||||||
|
# page (see https://github.com/hypothesis/via/issues/65
|
||||||
|
# for context)
|
||||||
|
attr_value = urljoin(self.orig_url, attr_value)
|
||||||
|
else:
|
||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||||
|
|
||||||
# special case: meta tag
|
# special case: meta tag
|
||||||
|
@ -158,8 +158,12 @@ ur"""
|
|||||||
<link rel="canonical" href="/web/20131226101010oe_/http://example.com/">
|
<link rel="canonical" href="/web/20131226101010oe_/http://example.com/">
|
||||||
|
|
||||||
# rel=canonical: no_rewrite
|
# rel=canonical: no_rewrite
|
||||||
>>> parse('<link rel=canonical href="http://example.com/">', urlrewriter=no_base_canon_rewriter)
|
>>> parse('<link rel=canonical href="http://example.com/canon/path">', urlrewriter=no_base_canon_rewriter)
|
||||||
<link rel="canonical" href="http://example.com/">
|
<link rel="canonical" href="http://example.com/canon/path">
|
||||||
|
|
||||||
|
# rel=canonical: no_rewrite
|
||||||
|
>>> parse('<link rel=canonical href="/relative/path">', urlrewriter=no_base_canon_rewriter)
|
||||||
|
<link rel="canonical" href="http://example.com/relative/path">
|
||||||
|
|
||||||
# doctype
|
# doctype
|
||||||
>>> parse('<!doctype html PUBLIC "public">')
|
>>> parse('<!doctype html PUBLIC "public">')
|
||||||
@ -210,26 +214,24 @@ from pywb.rewrite.html_rewriter import HTMLRewriter
|
|||||||
import pprint
|
import pprint
|
||||||
import urllib
|
import urllib
|
||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
ORIGINAL_URL = 'http://example.com/some/path/index.html'
|
||||||
'/web/',
|
|
||||||
rewrite_opts=dict(punycode_links=False))
|
|
||||||
|
|
||||||
full_path_urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
def new_rewriter(prefix='/web/', rewrite_opts=dict()):
|
||||||
'http://localhost:80/web/',
|
PROXY_PATH = '20131226101010/{0}'.format(ORIGINAL_URL)
|
||||||
rewrite_opts=dict(punycode_links=False))
|
return UrlRewriter(PROXY_PATH, prefix, rewrite_opts=rewrite_opts)
|
||||||
|
|
||||||
urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
urlrewriter = new_rewriter(rewrite_opts=dict(punycode_links=False))
|
||||||
'/web/',
|
|
||||||
rewrite_opts=dict(punycode_links=True))
|
|
||||||
|
|
||||||
|
full_path_urlrewriter = new_rewriter(prefix='http://localhost:80/web/',
|
||||||
|
rewrite_opts=dict(punycode_links=False))
|
||||||
|
|
||||||
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
urlrewriter_pencode = new_rewriter(rewrite_opts=dict(punycode_links=True))
|
||||||
'/web/',
|
|
||||||
rewrite_opts=dict(rewrite_rel_canon=False,
|
no_base_canon_rewriter = new_rewriter(rewrite_opts=dict(rewrite_rel_canon=False,
|
||||||
rewrite_base=False))
|
rewrite_base=False))
|
||||||
|
|
||||||
def parse(data, head_insert=None, urlrewriter=urlrewriter):
|
def parse(data, head_insert=None, urlrewriter=urlrewriter):
|
||||||
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
parser = HTMLRewriter(urlrewriter, head_insert = head_insert, url = ORIGINAL_URL)
|
||||||
|
|
||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode):
|
||||||
data = data.encode('utf-8')
|
data = data.encode('utf-8')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user