diff --git a/setup.py b/setup.py index fc67e82..52af206 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.28', + version='2.4.29', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 9fa27e4..3ca74f2 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -68,6 +68,7 @@ import certauth.certauth import warcprox import warcprox.main +import warcprox.crawl_log as crawl_log try: import http.client as http_client @@ -2141,6 +2142,13 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert fields[11] == b'-' extra_info = json.loads(fields[12].decode('utf-8')) +def test_crawl_log_canonicalization(): + assert crawl_log.canonicalize_url(None) is None + assert crawl_log.canonicalize_url("") == '' + assert crawl_log.canonicalize_url("-") == '-' + assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii" + assert crawl_log.canonicalize_url("Not a URL") == "Not a URL" + def test_long_warcprox_meta( warcprox_, http_daemon, archiving_proxies, playback_proxies): urls_before = warcprox_.proxy.running_stats.urls diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index 5ec4737..fdd3921 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -69,8 +69,8 @@ class CrawlLogger(object): hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path') #URLs are url encoded into plain ascii urls by HTTP spec. 
Since we're comparing against those, our urls sent over the json blob need to be encoded similarly - brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')) - hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')) + brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')) + hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')) if hop_path is None and brozzled_url is None and hop_via_url is None: #No hop info headers provided @@ -148,15 +148,17 @@ class CrawlLogger(object): else: return recorded_url.status - def canonicalize_url(self, url): - #URL needs to be split out to separately encode the hostname from the rest of the path. - #hostname will be idna encoded (punycode) - #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars. - try: - parsed_url=rfc3986.urlparse(url) - encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna')) - return encoded_url.unsplit() - except (TypeError, ValueError) as e: - logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e) - return url +def canonicalize_url(url): + #URL needs to be split out to separately encode the hostname from the rest of the path. + #hostname will be idna encoded (punycode) + #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars. + if url is None or url == '-' or url == '': + return url + try: + parsed_url=rfc3986.urlparse(url) + encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna')) + return encoded_url.unsplit() + except (TypeError, ValueError, AttributeError) as e: + logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e) + return url