Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)

Merge pull request #172 from internetarchive/adds-canonicalization-tests

Adding url canonicalization tests and handling of edge cases to reduc…

Commit fcd9b2b3bd
@@ -68,6 +68,7 @@ import certauth.certauth
 
 import warcprox
 import warcprox.main
+import warcprox.crawl_log as crawl_log
 
 try:
     import http.client as http_client
@@ -2140,6 +2141,13 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert fields[11] == b'-'
     extra_info = json.loads(fields[12].decode('utf-8'))
 
+def test_crawl_log_canonicalization():
+    assert crawl_log.canonicalize_url(None) is None
+    assert crawl_log.canonicalize_url("") is ''
+    assert crawl_log.canonicalize_url("-") == '-'
+    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
+    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
@@ -69,8 +69,8 @@ class CrawlLogger(object):
 
         hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
         #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
-        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
-        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
+        brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
 
         if hop_path is None and brozzled_url is None and hop_via_url is None:
             #No hop info headers provided
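Note on the mismatch this hunk works around (an illustrative sketch, not part of the patch): the URL the browser actually requests is plain ASCII per the HTTP spec (punycode host, percent-encoded path), while brozzled_url and hop_via_url arrive in the Warcprox-Meta JSON blob as raw Unicode, so a naive string comparison between the two fails. Using the values from the test above:

    requested = "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"  # what the proxy sees on the wire
    from_meta = "http://чунджа.kz/b/¶-non-ascii"               # what the metadata blob carries
    assert requested != from_meta  # direct comparison fails
    # canonicalize_url(from_meta) yields the wire form, so the two can be matched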
@@ -148,15 +148,17 @@ class CrawlLogger(object):
         else:
             return recorded_url.status
 
-    def canonicalize_url(self, url):
-        #URL needs to be split out to separately encode the hostname from the rest of the path.
-        #hostname will be idna encoded (punycode)
-        #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
-        try:
-            parsed_url=rfc3986.urlparse(url)
-            encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna'))
-            return encoded_url.unsplit()
-        except (TypeError, ValueError) as e:
-            logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
-            return url
+def canonicalize_url(url):
+    #URL needs to be split out to separately encode the hostname from the rest of the path.
+    #hostname will be idna encoded (punycode)
+    #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
+    if url is None or url == '-' or url == '':
+        return url
+    try:
+        parsed_url=rfc3986.urlparse(url)
+        encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+        return encoded_url.unsplit()
+    except (TypeError, ValueError, AttributeError) as e:
+        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
+        return url
+
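For readers without rfc3986 handy, here is a rough standard-library sketch of the same canonicalization idea. The helper name, the port handling, and the exact set of reserved characters left unencoded are illustrative assumptions, not part of the patch; the committed code uses rfc3986 as shown above.

    import logging
    from urllib.parse import urlsplit, urlunsplit, quote

    def canonicalize_url_sketch(url):
        # Hypothetical stand-in for the rfc3986-based canonicalize_url():
        # IDNA-encode the hostname and percent-encode non-ASCII characters in
        # the rest of the URL, leaving RFC 3986 reserved characters alone,
        # which roughly matches what browsers put on the wire.
        if url is None or url == '-' or url == '':
            return url
        try:
            parts = urlsplit(url)
            host = parts.hostname.encode('idna').decode('ascii')
            if parts.port:
                host = '%s:%s' % (host, parts.port)
            path = quote(parts.path, safe=":/@!$&'()*+,;=%")
            query = quote(parts.query, safe=":/?@!$&'()*+,;=%")
            return urlunsplit((parts.scheme, host, path, query, parts.fragment))
        except (TypeError, ValueError, AttributeError) as e:
            # e.g. "Not a URL" has no hostname, so .encode() raises AttributeError
            logging.warning('canonicalization failed, returning raw url %s - %s', url, e)
            return url

    # canonicalize_url_sketch("http://чунджа.kz/b/¶-non-ascii")
    # -> "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"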