Merge branch 'adds-canonicalization-tests' into qa

This commit is contained in:
Adam Miller 2022-04-26 23:49:12 +00:00
commit aa4a550b12
3 changed files with 24 additions and 14 deletions

View File

@ -44,7 +44,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.28',
version='2.4.29',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -68,6 +68,7 @@ import certauth.certauth
import warcprox
import warcprox.main
import warcprox.crawl_log as crawl_log
try:
import http.client as http_client
@ -2141,6 +2142,13 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
assert fields[11] == b'-'
extra_info = json.loads(fields[12].decode('utf-8'))
def test_crawl_log_canonicalization():
    """Exercise crawl_log.canonicalize_url on placeholders, an IDN URL and a non-URL."""
    # Placeholder values are expected to pass through untouched.
    assert crawl_log.canonicalize_url(None) is None
    # Compare strings by value: `is` against a literal only passes when the
    # interpreter happens to intern the string, and it triggers a
    # SyntaxWarning on Python 3.8+.
    assert crawl_log.canonicalize_url("") == ''
    assert crawl_log.canonicalize_url("-") == '-'
    # Non-ASCII host becomes punycode; non-ASCII path bytes are percent-encoded.
    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
    # A string that is not a URL comes back unchanged.
    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"
def test_long_warcprox_meta(
warcprox_, http_daemon, archiving_proxies, playback_proxies):
urls_before = warcprox_.proxy.running_stats.urls

View File

@ -69,8 +69,8 @@ class CrawlLogger(object):
hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
# Per the HTTP spec, URLs on the wire are URL-encoded into plain ASCII. Since we compare against those, the URLs sent to us in the JSON blob need to be encoded the same way.
brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
if hop_path is None and brozzled_url is None and hop_via_url is None:
#No hop info headers provided
@ -148,15 +148,17 @@ class CrawlLogger(object):
else:
return recorded_url.status
def canonicalize_url(self, url):
    """Return ``url`` with its hostname IDNA-encoded, falling back to ``url`` itself.

    ``None``, ``''`` and ``'-'`` are treated as "no URL" placeholders and are
    returned as-is without being parsed. Any other value is parsed with
    ``rfc3986.urlparse``; if parsing or host encoding fails, a warning is
    logged and the raw value is returned unchanged.
    """
    #URL needs to be split out to separately encode the hostname from the rest of the path.
    #hostname will be idna encoded (punycode)
    #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
    if url is None or url == '-' or url == '':
        return url
    try:
        parsed_url = rfc3986.urlparse(url)
        encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
        return encoded_url.unsplit()
    except (TypeError, ValueError, AttributeError) as e:
        # AttributeError covers inputs with no host component, where
        # parsed_url.host is None and has no .encode().
        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
        return url
def canonicalize_url(url):
    """Return ``url`` with its hostname IDNA-encoded (punycode).

    The URL is split so the hostname can be encoded separately from the rest.
    ``None``, ``'-'`` and ``''`` are handed back untouched. If the value
    cannot be parsed or its host cannot be encoded, a warning is logged and
    the raw value is returned.
    """
    # Placeholder / empty values: nothing to canonicalize.
    if url is None or url in ('-', ''):
        return url
    try:
        parts = rfc3986.urlparse(url)
        # Only the host is re-encoded here; every other component is carried
        # over by copy_with().
        punycode_host = parts.host.encode('idna')
        return parts.copy_with(host=punycode_host).unsplit()
    except (TypeError, ValueError, AttributeError) as exc:
        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, exc)
        return url