Add URL canonicalization tests and handle edge cases to reduce log noise

2025-01-18 13:22:09 +01:00 · 2022-04-26 23:48:54 +00:00 · 2022-04-26 23:48:54 +00:00 · 731cfe80cc
commit 731cfe80cc
parent 9521042a23
2 changed files with 23 additions and 13 deletions
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@ -68,6 +68,7 @@ import certauth.certauth
 import warcprox
 import warcprox.main
 import warcprox.crawl_log as crawl_log
 try:
    import http.client as http_client
@ -2140,6 +2141,13 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
    assert fields[11] == b'-'
    extra_info = json.loads(fields[12].decode('utf-8'))
 def test_crawl_log_canonicalization():
    """Check crawl_log.canonicalize_url on placeholder, non-ASCII and non-URL input.

    None, the empty string and '-' are expected to come back unchanged; a
    URL with a non-ASCII host and path is expected to come back with a
    punycode host and a percent-encoded path; text that is not a URL is
    expected to come back as-is.
    """
    assert crawl_log.canonicalize_url(None) is None
    # Compare by value: `is ''` tests object identity, which only holds
    # because of string interning and raises a SyntaxWarning on Python 3.8+.
    assert crawl_log.canonicalize_url("") == ''
    assert crawl_log.canonicalize_url("-") == '-'
    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"
 def test_long_warcprox_meta(
        warcprox_, http_daemon, archiving_proxies, playback_proxies):
    urls_before = warcprox_.proxy.running_stats.urls
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@ -69,8 +69,8 @@ class CrawlLogger(object):
        hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
        #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
-        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
-        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
+        hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
        if hop_path is None and brozzled_url is None and hop_via_url is None:
            #No hop info headers provided
@ -148,15 +148,17 @@ class CrawlLogger(object):
        else:
            return recorded_url.status
-    def canonicalize_url(self, url):
+def canonicalize_url(url):
-        #URL needs to be split out to separately encode the hostname from the rest of the path.
+    #URL needs to be split out to separately encode the hostname from the rest of the path.
-        #hostname will be idna encoded (punycode)
+    #hostname will be idna encoded (punycode)
-        #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
+    #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
-        try:
+    if url is None or url == '-' or url == '':
-            parsed_url=rfc3986.urlparse(url)
+        return url
-            encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+    try:
-            return encoded_url.unsplit()
+        parsed_url=rfc3986.urlparse(url)
-        except (TypeError, ValueError) as e:
+        encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna'))
-            logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
+        return encoded_url.unsplit()
-            return url
+    except (TypeError, ValueError, AttributeError) as e:
        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
        return url