Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)

Merge pull request #172 from internetarchive/adds-canonicalization-tests

Adding url canonicalization tests and handling of edge cases to reduc…

Commit fcd9b2b3bd
@@ -68,6 +68,7 @@ import certauth.certauth
 
 import warcprox
 import warcprox.main
+import warcprox.crawl_log as crawl_log
 
 try:
     import http.client as http_client
@@ -2140,6 +2141,13 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert fields[11] == b'-'
     extra_info = json.loads(fields[12].decode('utf-8'))
 
+def test_crawl_log_canonicalization():
+    assert crawl_log.canonicalize_url(None) is None
+    assert crawl_log.canonicalize_url("") is ''
+    assert crawl_log.canonicalize_url("-") == '-'
+    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
+    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
@@ -69,8 +69,8 @@ class CrawlLogger(object):
 
         hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
         #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
-        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
-        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
+        brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
 
         if hop_path is None and brozzled_url is None and hop_via_url is None:
             #No hop info headers provided
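Note on the mismatch this hunk works around (an illustrative sketch, not part of the patch): the URL the browser actually requests is plain ASCII per the HTTP spec (punycode host, percent-encoded path), while brozzled_url and hop_via_url arrive in the Warcprox-Meta JSON blob as raw Unicode, so a naive string comparison between the two fails. Using the values from the test above:

    requested = "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"  # what the proxy sees on the wire
    from_meta = "http://чунджа.kz/b/¶-non-ascii"               # what the metadata blob carries
    assert requested != from_meta  # direct comparison fails
    # canonicalize_url(from_meta) yields the wire form, so the two can be matched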
@@ -148,15 +148,17 @@ class CrawlLogger(object):
         else:
             return recorded_url.status
 
-    def canonicalize_url(self, url):
-        #URL needs to be split out to separately encode the hostname from the rest of the path.
-        #hostname will be idna encoded (punycode)
-        #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
-        try:
-            parsed_url=rfc3986.urlparse(url)
-            encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna'))
-            return encoded_url.unsplit()
-        except (TypeError, ValueError) as e:
-            logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
-            return url
+def canonicalize_url(url):
+    #URL needs to be split out to separately encode the hostname from the rest of the path.
+    #hostname will be idna encoded (punycode)
+    #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
+    if url is None or url == '-' or url == '':
+        return url
+    try:
+        parsed_url=rfc3986.urlparse(url)
+        encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+        return encoded_url.unsplit()
+    except (TypeError, ValueError, AttributeError) as e:
+        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
+        return url
+
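For readers without rfc3986 handy, here is a rough standard-library sketch of the same canonicalization idea. The helper name, the port handling, and the exact set of reserved characters left unencoded are illustrative assumptions, not part of the patch; the committed code uses rfc3986 as shown above.

    import logging
    from urllib.parse import urlsplit, urlunsplit, quote

    def canonicalize_url_sketch(url):
        # Hypothetical stand-in for the rfc3986-based canonicalize_url():
        # IDNA-encode the hostname and percent-encode non-ASCII characters in
        # the rest of the URL, leaving RFC 3986 reserved characters alone,
        # which roughly matches what browsers put on the wire.
        if url is None or url == '-' or url == '':
            return url
        try:
            parts = urlsplit(url)
            host = parts.hostname.encode('idna').decode('ascii')
            if parts.port:
                host = '%s:%s' % (host, parts.port)
            path = quote(parts.path, safe=":/@!$&'()*+,;=%")
            query = quote(parts.query, safe=":/?@!$&'()*+,;=%")
            return urlunsplit((parts.scheme, host, path, query, parts.fragment))
        except (TypeError, ValueError, AttributeError) as e:
            # e.g. "Not a URL" has no hostname, so .encode() raises AttributeError
            logging.warning('canonicalization failed, returning raw url %s - %s', url, e)
            return url

    # canonicalize_url_sketch("http://чунджа.kz/b/¶-non-ascii")
    # -> "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"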