Add URL canonicalization tests and handle edge cases to reduce log noise

This commit is contained in:
Adam Miller 2022-04-26 23:48:54 +00:00
parent 9521042a23
commit 731cfe80cc
2 changed files with 23 additions and 13 deletions

View File

@ -68,6 +68,7 @@ import certauth.certauth
import warcprox import warcprox
import warcprox.main import warcprox.main
import warcprox.crawl_log as crawl_log
try: try:
import http.client as http_client import http.client as http_client
@ -2140,6 +2141,13 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
assert fields[11] == b'-' assert fields[11] == b'-'
extra_info = json.loads(fields[12].decode('utf-8')) extra_info = json.loads(fields[12].decode('utf-8'))
def test_crawl_log_canonicalization():
    """Check crawl_log.canonicalize_url on placeholder, IDN and non-URL input."""
    # Placeholder values are expected to pass through unchanged.
    assert crawl_log.canonicalize_url(None) is None
    # Compare by value: identity (`is`) against a str literal depends on
    # interning and raises a SyntaxWarning on Python 3.8+.
    assert crawl_log.canonicalize_url("") == ''
    assert crawl_log.canonicalize_url("-") == '-'
    # Non-ASCII hostname becomes punycode; non-ASCII path chars are
    # percent-encoded.
    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
    # Input that cannot be canonicalized is returned as-is.
    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"
def test_long_warcprox_meta( def test_long_warcprox_meta(
warcprox_, http_daemon, archiving_proxies, playback_proxies): warcprox_, http_daemon, archiving_proxies, playback_proxies):
urls_before = warcprox_.proxy.running_stats.urls urls_before = warcprox_.proxy.running_stats.urls

View File

@ -69,8 +69,8 @@ class CrawlLogger(object):
hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path') hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
#URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')) brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')) hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
if hop_path is None and brozzled_url is None and hop_via_url is None: if hop_path is None and brozzled_url is None and hop_via_url is None:
#No hop info headers provided #No hop info headers provided
@ -148,15 +148,17 @@ class CrawlLogger(object):
else: else:
return recorded_url.status return recorded_url.status
def canonicalize_url(url):
    """Return `url` with its hostname IDNA (punycode) encoded.

    The URL is split so the hostname can be encoded separately from the rest
    of the URL. Per the original author's notes, the remainder is urlencoded
    the way browsers do it: only "unsafe" characters are escaped, "reserved"
    characters are left alone.

    Args:
        url: the URL to canonicalize; may be None, '' or the '-' placeholder.

    Returns:
        The canonicalized URL, or `url` unchanged when it is None, '', '-',
        or when canonicalization fails.
    """
    # Placeholder values carry no URL to parse; hand them back before the
    # try block so they never produce a canonicalization warning.
    if url is None or url == '-' or url == '':
        return url
    try:
        parsed_url = rfc3986.urlparse(url)
        encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
        return encoded_url.unsplit()
    except (TypeError, ValueError, AttributeError) as e:
        # AttributeError presumably covers input with no parseable host
        # (parsed_url.host not being a str) -- TODO confirm against rfc3986.
        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
        return url