wburl idn: more complete support for idn urls (#66)

Add distinct to_iri() and to_uri() functions in WbUrl. The internal representation is always an ascii uri for rewriting; output defaults to the iri representation unless 'rewrite_ascii_urls_only' is set to true per collection. Add wbrequest.get_url() to get the url as either an iri or a uri, to be passed to templates.
2025-03-15 00:03:28 +01:00 · 2015-01-26 09:52:04 -08:00 · 2015-01-26 09:52:04 -08:00 · 695245d9e8
commit 695245d9e8
parent edff3f17fb
10 changed files with 235 additions and 93 deletions
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@ -105,6 +105,18 @@ class WbRequest(object):

        self._parse_extra()

+    def get_url(self, url=None):
+        """Return a url for display, either as an IRI or unchanged.
+
+        :param url: url to return; when empty or omitted, the url of
+                    this request's ``wb_url`` is used instead.
+        :return: ``None`` if the request has no ``wb_url``; the url as
+                 is if the ``rewrite_ascii_urls_only`` rewrite option is
+                 set; otherwise the result of ``wb_url.to_iri(url)``.
+        """
+        if not self.wb_url:
+            return None
+
+        if not url:
+            url = self.wb_url.url
+
+        if self.urlrewriter.rewrite_opts.get('rewrite_ascii_urls_only'):
+            # honor an explicitly passed url (eg. a cdx 'original' field)
+            # rather than always answering with the request url
+            return url
+
+        return self.wb_url.to_iri(url)
+
    def _is_ajax(self):
        value = self.env.get('HTTP_X_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
--- a/pywb/rewrite/test/test_wburl.py
+++ b/pywb/rewrite/test/test_wburl.py
@ -4,94 +4,138 @@
 ur"""
 # Replay Urls
 # ======================
->>> repr(WbUrl('20131010000506/example.com'))
-"('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')"
+>>> repr_unicode(WbUrl('20131010000506/example.com'))
+('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')

->>> repr(WbUrl('20130102im_/https://example.com'))
-"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')"
+>>> repr_unicode(WbUrl('20130102im_/https://example.com'))
+('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')

->>> repr(WbUrl('20130102im_/https:/example.com'))
-"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')"
+>>> repr_unicode(WbUrl('20130102im_/https:/example.com'))
+('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')

 # Protocol agnostic convert to http
->>> repr(WbUrl('20130102im_///example.com'))
-"('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')"
+>>> repr_unicode(WbUrl('20130102im_///example.com'))
+('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')

->>> repr(WbUrl('cs_/example.com'))
-"('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')"
+>>> repr_unicode(WbUrl('cs_/example.com'))
+('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')

->>> repr(WbUrl('https://example.com/xyz'))
-"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')"
+>>> repr_unicode(WbUrl('https://example.com/xyz'))
+('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')

->>> repr(WbUrl('https:/example.com/xyz'))
-"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')"
+>>> repr_unicode(WbUrl('https:/example.com/xyz'))
+('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')

->>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
-"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
+>>> repr_unicode(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
+('latest_replay', '', '', 'https://example.com/xyz?a=/&b=.', 'https://example.com/xyz?a=/&b=.')

 # Test scheme partially encoded urls
->>> repr(WbUrl('https%3A//example.com/'))
-"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
+>>> repr_unicode(WbUrl('https%3A//example.com/'))
+('latest_replay', '', '', 'https://example.com/', 'https://example.com/')

->>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
-"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
+>>> repr_unicode(WbUrl('2014/http%3A%2F%2Fexample.com/'))
+('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')

 # Test IDNs
->>> repr(WbUrl(u'http://пример.испытание'))
-"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')"

->>> repr(WbUrl(u'https://пример.испытание/abc/'))
-"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
+To IRI
+>>> print(WbUrl.to_iri(u'https://пример.испытание'))
+https://пример.испытание

->>> repr(WbUrl(u'//пример.испытание/abc/'))
-"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
+>>> print(WbUrl.to_iri(u'пример.испытание'))
+пример.испытание

->>> repr(WbUrl(u'2014id_/https://пример.испытание/abc'))
-"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
+>>> print(WbUrl.to_iri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))))
+http://пример.испытание
+
+>>> print(WbUrl.to_iri(u'//пример.испытание/abc/испытание'))
+//пример.испытание/abc/испытание
+
+>>> print(WbUrl.to_iri(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
+пример.испытание/abc/пример
+
+>>> print(WbUrl.to_iri('https://xn--e1afmkfd.xn--80akhbyknj4f'))
+https://пример.испытание
+
+
+To URI
+>>> print(WbUrl.to_uri(u'https://пример.испытание'))
+https://xn--e1afmkfd.xn--80akhbyknj4f
+
+>>> print(WbUrl.to_uri(u'пример.испытание'))
+xn--e1afmkfd.xn--80akhbyknj4f
+
+>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))))
+http://xn--e1afmkfd.xn--80akhbyknj4f
+
+>>> print(WbUrl.to_uri(u'//пример.испытание/abc/испытание'))
+//xn--e1afmkfd.xn--80akhbyknj4f/abc%2F%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
+
+>>> print(WbUrl.to_uri('//' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
+//xn--e1afmkfd.xn--80akhbyknj4f/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
+
+>>> print(WbUrl.to_uri('https://xn--e1afmkfd.xn--80akhbyknj4f/abc/'))
+https://xn--e1afmkfd.xn--80akhbyknj4f/abc/
+
+>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
+http://xn--d0-olcluwd.xn--80akhbyknj4f
+
+# IRI representation
+>>> repr_unicode(WbUrl(u'http://пример.испытание'))
+('latest_replay', '', '', 'http://пример.испытание', 'http://пример.испытание')
+
+>>> repr_unicode(WbUrl(u'https://пример.испытание/abc/'))
+('latest_replay', '', '', 'https://пример.испытание/abc/', 'https://пример.испытание/abc/')
+
+>>> repr_unicode(WbUrl(u'//пример.испытание/abc/'))
+('latest_replay', '', '', 'http://пример.испытание/abc/', 'http://пример.испытание/abc/')
+
+>>> repr_unicode(WbUrl(u'2014id_/https://пример.испытание/abc'))
+('replay', '2014', 'id_', 'https://пример.испытание/abc', '2014id_/https://пример.испытание/abc')

 # percent-encoded form (as sent by browser usually)
->>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
-"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
+>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
+('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc')

 # percent-encoded form -- scheme relative
->>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
-"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
+>>> repr_unicode(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
+('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc')

 # invalid: truncated and superfluous '%', ignore invalid (no exception)
->>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
-"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')"
+>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
+('replay', '2014', 'id_', 'http://d0ример.испытание%/abc', '2014id_/http://d0ример.испытание%/abc')


 # Query Urls
 # ======================
->>> repr(WbUrl('*/http://example.com/abc?def=a'))
-"('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')"
+>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a'))
+('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')

->>> repr(WbUrl('*/http://example.com/abc?def=a*'))
-"('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"
+>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a*'))
+('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')

->>> repr(WbUrl('2010*/http://example.com/abc?def=a'))
-"('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')"
+>>> repr_unicode(WbUrl('2010*/http://example.com/abc?def=a'))
+('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')

 # timestamp range query
->>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a'))
-"('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')"
+>>> repr_unicode(WbUrl('2009-2015*/http://example.com/abc?def=a'))
+('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')

->>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
-"('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"
+>>> repr_unicode(WbUrl('json/*/http://example.com/abc?def=a'))
+('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')

->>> repr(WbUrl('timemap-link/2011*/http://example.com/abc?def=a'))
-"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')"
+>>> repr_unicode(WbUrl('timemap-link/2011*/http://example.com/abc?def=a'))
+('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')

 # strip off repeated, likely scheme-agnostic, slashes altogether
->>> repr(WbUrl('///example.com'))
-"('latest_replay', '', '', 'http://example.com', 'http://example.com')"
+>>> repr_unicode(WbUrl('///example.com'))
+('latest_replay', '', '', 'http://example.com', 'http://example.com')

->>> repr(WbUrl('//example.com/'))
-"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
+>>> repr_unicode(WbUrl('//example.com/'))
+('latest_replay', '', '', 'http://example.com/', 'http://example.com/')

->>> repr(WbUrl('/example.com/'))
-"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
+>>> repr_unicode(WbUrl('/example.com/'))
+('latest_replay', '', '', 'http://example.com/', 'http://example.com/')

 # Is_ Tests
 >>> u = WbUrl('*/http://example.com/abc?def=a*')
@ -131,7 +175,20 @@ Exception: ('Invalid WbUrl: ', '')
 """

 from pywb.rewrite.wburl import WbUrl
-from urllib import quote_plus
+from urllib import quote_plus, unquote_plus
+
+from StringIO import StringIO
+
+
+def repr_unicode(wburl):
+    """Print the fields of a WbUrl in a tuple-like form, urls as IRIs.
+
+    Prints the type, timestamp and mod, then the url converted with
+    WbUrl.to_iri() and the full wburl string from to_str(iri=True).
+    Printing (rather than returning) lets the doctests show unicode
+    text directly instead of escaped repr output.
+    """
+    head = "('{0}', '{1}', '{2}', '".format(wburl.type, wburl.timestamp, wburl.mod)
+    pieces = [head,
+              WbUrl.to_iri(wburl.url),
+              "', '",
+              wburl.to_str(iri=True),
+              "')"]
+    print(''.join(pieces))
+

 if __name__ == "__main__":
    import doctest
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@ -52,6 +52,9 @@ class UrlRewriter(object):
            is_abs = True
            url = 'http:' + url

+        # when set, rewritten urls are kept in ascii form (no iri conversion)
+        ascii_urls_only = self.rewrite_opts.get('rewrite_ascii_urls_only', False)
+
        # Optimized rewriter for
        # -rel urls that don't start with / and
        # do not contain ../ and no special mod
@ -68,7 +71,11 @@ class UrlRewriter(object):
            if mod is None:
                mod = wburl.mod

-            final_url = self.prefix + wburl.to_str(mod=mod, url=new_url)
+            final_url = self.prefix + wburl.to_str(mod=mod,
+                                                   url=new_url,
+                                                   iri=not ascii_urls_only)
+        if not ascii_urls_only:
+            final_url = final_url.encode('utf-8')

        return final_url

--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@ -90,6 +90,79 @@ class WbUrl(BaseWbUrl):
    #PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
    FIRST_PATH = re.compile('(?<![:/])/(?![/])')

+    @staticmethod
+    def to_iri(url):
+        """Return url as a unicode IRI, with the host decoded from punycode.
+
+        A str url is first percent-decoded (unquote_plus) and decoded
+        as utf-8; a unicode url is used as is. The host, which is the
+        text after the last '/' preceding the first path separator, is
+        then idna-decoded when possible.
+
+        :param url: str or unicode url, with or without a scheme
+        :return: unicode url
+        """
+        if isinstance(url, str):
+            url = urllib.unquote_plus(url)
+            # NOTE(review): strict decode raises UnicodeDecodeError when
+            # the unquoted bytes are not valid utf-8, whereas to_uri()
+            # decodes with 'ignore' -- confirm which is intended
+            url = url.decode('utf-8')
+
+        # parts[0] is the scheme + host, parts[1] (if any) the remainder
+        parts = WbUrl.FIRST_PATH.split(url, 1)
+
+        scheme_dom = parts[0].rsplit(u'/', 1)
+        dom = scheme_dom[-1]
+
+        try:
+            dom = dom.decode('idna')
+        except UnicodeError:
+            # host is not decodable idna (eg. it is already non-ascii
+            # text): keep it unchanged
+            pass
+
+        if len(scheme_dom) > 1:
+            url = scheme_dom[0] + u'/' + dom
+        else:
+            url = dom
+
+        if len(parts) > 1:
+            url += u'/' + parts[1]
+
+        return url
+
+
+    @staticmethod
+    def to_uri(url, was_uni=False):
+        """Return url in uri form, with the host encoded as punycode.
+
+        The scheme + host part is percent-decoded and its host is
+        idna-encoded. A unicode remainder (path and beyond) is utf-8
+        percent-encoded, while a str remainder is appended unchanged.
+        A str url whose scheme + host part contains nothing to
+        percent-decode is returned unchanged.
+
+        :param url: str or unicode url, with or without a scheme
+        :param was_uni: currently unused; kept so existing callers that
+                        pass it keep working
+        :return: the converted url
+        """
+        # parts[0] is the scheme + host, parts[1] (if any) the remainder
+        parts = WbUrl.FIRST_PATH.split(url, 1)
+
+        scheme_dom = urllib.unquote_plus(parts[0])
+
+        if isinstance(scheme_dom, str):
+            # nothing was percent-encoded in the scheme + host
+            if scheme_dom == parts[0]:
+                return url
+
+            scheme_dom = scheme_dom.decode('utf-8', 'ignore')
+
+        scheme_dom = scheme_dom.rsplit('/', 1)
+        dom = scheme_dom[-1]
+
+        try:
+            dom = dom.encode('idna')
+        except UnicodeError:
+            # invalid host (eg. an empty or over-long label): keep it
+            # unchanged instead of failing the whole conversion
+            pass
+
+        if len(scheme_dom) > 1:
+            url = scheme_dom[0] + '/' + dom
+        else:
+            url = dom
+
+        if len(parts) > 1:
+            if isinstance(parts[1], unicode):
+                # NOTE(review): quote_plus also encodes '/' as %2F, so
+                # path separators of a unicode url are escaped; kept
+                # because the existing doctests expect it -- confirm
+                url += '/' + urllib.quote_plus(parts[1].encode('utf-8'))
+            else:
+                url += '/' + parts[1]
+
+        return url
+
    # ======================

    def __init__(self, orig_url):
@ -106,30 +179,7 @@ class WbUrl(BaseWbUrl):
            if not self._init_replay(orig_url):
                raise Exception('Invalid WbUrl: ', orig_url)

-        if was_uni or '%' in self.url:
-            parts = self.FIRST_PATH.split(self.url, 1)
-
-            if was_uni or '%' in parts[0]:
-                if not was_uni:
-                    scheme_dom = urllib.unquote_plus(parts[0])
-                else:
-                    scheme_dom = parts[0]
-
-                scheme_dom = scheme_dom.rsplit('/', 1)
-
-                dom = scheme_dom[-1]
-
-                dom = dom.decode('utf-8', 'ignore')
-                dom = dom.encode('idna')
-
-                if len(scheme_dom) > 1:
-                    self.url = scheme_dom[0] + '/' + dom
-                else:
-                    self.url = dom
-
-                if len(parts) > 1:
-                    self.url += '/' + parts[1]
-
+        self.url = WbUrl.to_uri(self.url, was_uni)

        # protocol agnostic url -> http://
        # no protocol -> http://
@ -208,6 +258,8 @@ class WbUrl(BaseWbUrl):
        timestamp = overrides.get('timestamp', self.timestamp)
        end_timestamp = overrides.get('end_timestamp', self.end_timestamp)
        url = overrides.get('url', self.url)
+        if overrides.get('iri'):
+            url = WbUrl.to_iri(url)

        return self.to_wburl_str(url=url,
                                 type=type_,
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@ -132,9 +132,12 @@ this.load = function() {
        var hash = window.location.hash;

        var loc = window.location.href.replace(window.location.hash, "");
+        loc = decodeURI(loc);

        if (wbinfo.top_url && (loc != wbinfo.top_url) && wbinfo.mod != "bn_") {
            // Auto-redirect to top frame
+            console.log(wbinfo.top_url);
+            console.log(loc);
            window.location.replace(wbinfo.top_url + hash);
        } else {
            // Init Banner (no frame or top frame)
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@ -2,7 +2,7 @@
 {% if rule.js_rewrite_location != 'urls' and include_wombat %}
 <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
 <script>
-  {% set urlsplit = cdx.original | urlsplit %}
+  {% set urlsplit = cdx.url | urlsplit %}
  if (window && window._WBWombat && !window._wb_js_inited) {
  var _wb_wombat = new _WBWombat("{{ wbrequest.wb_prefix}}",
                 "{{ cdx['timestamp'] if include_ts else ''}}",
@ -15,7 +15,7 @@
 {% endif %}
 <script>
  wbinfo = {}
-  wbinfo.url = "{{ cdx.original }}";
+  wbinfo.url = "{{ cdx.url }}";
  wbinfo.timestamp = "{{ cdx.timestamp }}";
  wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
  wbinfo.mod = "{{ wbrequest.wb_url.mod }}";
--- a/pywb/ui/query.html
+++ b/pywb/ui/query.html
@ -6,14 +6,14 @@ function ts_to_date(ts, is_gmt)
    if (ts.length < 14) {
        return ts;
    }
-    
-    var datestr = (ts.substring(0, 4) + "-" + 
+
+    var datestr = (ts.substring(0, 4) + "-" +
                  ts.substring(4, 6) + "-" +
                  ts.substring(6, 8) + "T" +
                  ts.substring(8, 10) + ":" +
                  ts.substring(10, 12) + ":" +
                  ts.substring(12, 14) + "-00:00");
-    
+
    var date = new Date(datestr);
    if (is_gmt) {
        return date.toGMTString();
@ -36,12 +36,12 @@ function ts_to_date(ts, is_gmt)
    </tr>
    {% for cdx in cdx_lines  %}
    <tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}">
-      <td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">
+      <td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.url }}">
      <script>document.write(ts_to_date("{{ cdx['timestamp']}}", true))</script>
      </a></td>
-      <td>{{ cdx['statuscode'] }}</td>
-      <td>{{ cdx['original'] }}</td>
-      <td>{{ cdx['filename'] }}</td>
+      <td>{{ cdx.statuscode }}</td>
+      <td>{{ cdx.url }}</td>
+      <td>{{ cdx.filename }}</td>
    </tr>
    {% endfor %}
  </table>
--- a/pywb/webapp/handlers.py
+++ b/pywb/webapp/handlers.py
@ -72,7 +72,8 @@ class SearchPageWbUrlHandler(WbUrlHandler):
        return self.handle_request(wbrequest)

    def get_top_frame_params(self, wbrequest, mod=''):
-        embed_url = wbrequest.wb_url.to_str(mod=mod)
+        embed_url = wbrequest.wb_url.to_str(mod=mod, url='')
+        embed_url += wbrequest.get_url()

        if wbrequest.wb_url.timestamp:
            timestamp = wbrequest.wb_url.timestamp
@ -82,7 +83,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
        params = dict(embed_url=embed_url,
                      wbrequest=wbrequest,
                      timestamp=timestamp,
-                      url=wbrequest.wb_url.url,
+                      url=wbrequest.get_url(),
                      banner_html=self.banner_html)

        return params
--- a/pywb/webapp/views.py
+++ b/pywb/webapp/views.py
@ -3,6 +3,7 @@ from pywb.framework.wbrequestresponse import WbResponse
 from pywb.framework.memento import make_timemap, LINK_FORMAT

 import urlparse
+import urllib
 import logging

 from os import path
@ -128,12 +129,16 @@ class HeadInsertView(J2TemplateView):
    def create_insert_func(self, wbrequest,
                           include_ts=True):

+        url = wbrequest.get_url()
+
        top_url = wbrequest.wb_prefix
-        top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod)
+        top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod, url='')
+        top_url += url

        include_wombat = not wbrequest.wb_url.is_banner_only

        def make_head_insert(rule, cdx):
+            cdx['url'] = url
            return (self.render_to_string(wbrequest=wbrequest,
                                          cdx=cdx,
                                          top_url=top_url,
@ -165,9 +170,14 @@ class HeadInsertView(J2TemplateView):
 #=================================================================
 class J2HtmlCapturesView(J2TemplateView):
    def render_response(self, wbrequest, cdx_lines, **kwargs):
+        def format_cdx_lines():
+            for cdx in cdx_lines:
+                cdx['url'] = wbrequest.get_url(url=cdx['original'])
+                yield cdx
+
        return J2TemplateView.render_response(self,
-                                    cdx_lines=list(cdx_lines),
-                                    url=wbrequest.wb_url.url,
+                                    cdx_lines=list(format_cdx_lines()),
+                                    url=wbrequest.get_url(),
                                    type=wbrequest.wb_url.type,
                                    prefix=wbrequest.wb_prefix,
                                    **kwargs)
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -97,7 +97,7 @@ class TestWb:
        resp = self.testapp.get('/pywb/20140127171238tf_/http://www.iana.org/')

        assert '<iframe ' in resp.body
-        assert '/pywb/20140127171238/http://www.iana.org/' in resp.body
+        assert '/pywb/20140127171238/http://www.iana.org/' in resp.body, resp.body

    def test_replay_content(self):
        resp = self.testapp.get('/pywb/20140127171238/http://www.iana.org/')