From 1e3d22aba4a600dc54548449950dd4cbe055d1a5 Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Wed, 20 Apr 2022 22:48:28 +0000
Subject: [PATCH] Better handle non-ascii urls for crawl log hop info

---
 setup.py               |  1 +
 tests/Dockerfile       | 63 +++++++++++++++++++++++++++++++-----------
 tests/run-tests.sh     | 11 +++++---
 tests/test_warcprox.py | 34 +++++++++++++++++++++++
 warcprox/crawl_log.py  | 24 ++++++++++++----
 5 files changed, 107 insertions(+), 26 deletions(-)

diff --git a/setup.py b/setup.py
index 629f608..b3e2ba9 100755
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,7 @@ deps = [
     'idna==2.10',
     'PyYAML>=5.1',
     'cachetools',
+    'rfc3986>=2.0.0',
 ]
 try:
     import concurrent.futures
diff --git a/tests/Dockerfile b/tests/Dockerfile
index df9a688..24b6838 100644
--- a/tests/Dockerfile
+++ b/tests/Dockerfile
@@ -19,7 +19,7 @@
 # USA.
 #
 
-FROM phusion/baseimage
+FROM ubuntu:focal-20220404
 MAINTAINER Noah Levitt
 
 # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
@@ -28,10 +28,11 @@ MAINTAINER Noah Levitt
 ENV LANG=C.UTF-8
 
 RUN apt-get update && apt-get --auto-remove -y dist-upgrade
+RUN apt-get install -y ca-certificates curl gnupg wget
 
 # Add the RethinkDB repository and public key
-RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
-    && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
+RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
+RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
     && apt-get update && apt-get -y install rethinkdb
 
 RUN mkdir -vp /etc/service/rethinkdb \
@@ -57,25 +58,54 @@ RUN mkdir -vp /etc/service/tor \
     && chmod a+x /etc/service/tor/run
 
 # hadoop hdfs for trough
-RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
-    && . /etc/lsb-release \
-    && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list
-RUN apt-get update
-RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get install -y openjdk-8-jdk openssh-server
 
-RUN su hdfs -c 'hdfs namenode -format'
+# set java home
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
 
-RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
-    && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
+# setup ssh with no passphrase
+RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
+    && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
 
-RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
-    && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$,  <property>\n    <name>dfs.permissions.enabled</name>\n    <value>false</value>\n  </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
+RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
+    && tar xfz hadoop-2.7.3.tar.gz \
+    && mv /hadoop-2.7.3 /usr/local/hadoop \
+    && rm /hadoop-2.7.3.tar.gz
+
+# hadoop environment variables
+ENV HADOOP_HOME=/usr/local/hadoop
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
 
-RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
-    && chmod a+x /etc/my_init.d/50_start_hdfs.sh
+# hadoop-store
+RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
+    && mkdir -p $HADOOP_HOME/hdfs/datanode
 
-RUN apt-get install -y libsqlite3-dev
+# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
+COPY config/ /tmp/
+RUN mv /tmp/ssh_config $HOME/.ssh/config \
+    && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
+    && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
+    && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
+    && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
+    && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
+    && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+
+# Add startup script
+ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh
+
+# set permissions
+RUN chmod 744 -R $HADOOP_HOME
+
+# format namenode
+RUN $HADOOP_HOME/bin/hdfs namenode -format
+
+# run hadoop services
+#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash
+
+RUN apt-get install -y libsqlite3-dev build-essential
 
 # trough itself
 RUN virtualenv -p python3 /opt/trough-ve3 \
@@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
     && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
     && chmod a+x /etc/service/trough-segment-manager-server/run
+RUN apt-get install -y daemontools daemontools-run
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
index 12dd371..2268b6a 100755
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 docker build -t internetarchive/warcprox-tests $script_dir
 
-docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
+docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
     bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
         && (cd /warcprox && git diff HEAD) | patch -p1 \
         && virtualenv -p python3 /tmp/venv \
        && source /tmp/venv/bin/activate \
-        && pip --log-file /tmp/pip.log install . pytest mock requests warcio \
-        && py.test -v tests \
-        && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
+        && pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
+        && py.test -v tests; \
+        svscan /etc/service & \
+        sleep 10; \
+        py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
         && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
+        && /usr/local/hadoop/hadoop-services.sh \
         && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
 "
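For orientation before the test diff: warcprox's crawl log follows the
Heritrix crawl.log layout of 13 whitespace-separated fields (timestamp,
status, size, URL, hop path, via URL, mimetype, worker thread, fetch
timestamp+duration, digest, seed, annotations, JSON extra info). A
hypothetical line matching the new assertions below, with the port,
timestamps, and JSON payload invented for illustration:

    2022-04-20T22:48:28.123Z   200        154 http://localhost:8000/b/%C2%B6-non-ascii L http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii text/plain - 20220420224828123+456 sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439 http://example.com/¶-non-ascii - {"contentSize":154}

Note that the fetched URL (fields[3]) and via URL (fields[5]) are
canonicalized to ASCII, while the seed (fields[10]) is written as raw
UTF-8; that asymmetry is exactly what the new test asserts.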
diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 3a776ee..281338c 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -2106,6 +2106,40 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert set(extra_info.keys()) == {'exception'}
 
+    # Verify that non-ASCII URLs are encoded properly in the crawl log hop info
+    url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix": "test_crawl_log_8",
+            "metadata": {'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)
+
+    file = os.path.join(
+        warcprox_.options.crawl_log_dir,
+        'test_crawl_log_8-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_8 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
+    assert crawl_log_8[24:31] == b'   200 '
+    assert crawl_log_8[31:42] == b'       154 '
+    fields = crawl_log_8.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
+    assert fields[4] == b'L'
+    assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
+    assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
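The expected values in the test above come down to two encodings, shown
here as an illustrative sketch (not part of the patch) using only the
Python standard library:

    import urllib.parse

    # hostname: IDNA (punycode) encoding
    print('чунджа.kz'.encode('idna'))            # b'xn--80ahg0a3ax.kz'

    # path: percent-encode unsafe bytes as UTF-8, leaving reserved
    # characters such as '/' alone
    print(urllib.parse.quote('/b/¶-non-ascii'))  # /b/%C2%B6-non-ascii

The implementation below applies the host encoding via rfc3986, which can
split a URL, swap out just the host, and reassemble it.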
diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index e30b371..5ec4737 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -25,6 +25,7 @@ import json
 import os
 import warcprox
 import socket
+import rfc3986
 from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
 
 class CrawlLogger(object):
@@ -67,8 +68,9 @@ class CrawlLogger(object):
         logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)
         hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
-        brozzled_url = recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')
-        hop_via_url = recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')
+        # URLs on the wire are percent-encoded to plain ASCII per the HTTP spec; since we compare against those, the urls sent over the json blob need to be encoded the same way
+        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
 
         if hop_path is None and brozzled_url is None and hop_via_url is None:
             #No hop info headers provided
@@ -81,13 +83,11 @@ class CrawlLogger(object):
                 hop_via_url = "-"
         #Prefer referer header. Otherwise use provided via_url
         via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
+        logging.info('brozzled_url:%s recorded_url:%s', brozzled_url, recorded_url.url)
         if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
             #Requested page is not the Brozzled url, thus we are an embed or redirect.
             via_url = brozzled_url
-            if hop_path == "-":
-                hop_path = "B"
-            else:
-                hop_path = "".join([hop_path,"B"])
+            hop_path = "B" if hop_path == "-" else "".join([hop_path, "B"])
 
         fields = [
             '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
@@ -148,3 +148,15 @@ class CrawlLogger(object):
         else:
             return recorded_url.status
 
+    def canonicalize_url(self, url):
+        # The URL is split so the hostname can be IDNA-encoded (punycode) separately from the rest.
+        # The rest of the URL is urlencoded, but browsers only encode "unsafe" and not "reserved"
+        # characters, so the reserved chars are left alone here.
+        try:
+            parsed_url = rfc3986.urlparse(url)
+            encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+            return encoded_url.unsplit()
+        except (TypeError, ValueError) as e:
+            logging.warning('URL canonicalization failure; returning raw url: rfc3986 %s - %s', url, e)
+            return url
+
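A minimal standalone sketch of what the new canonicalize_url() does,
using the same rfc3986 calls as the patch (it assumes rfc3986>=2.0.0
parses the non-ASCII host, and that the path is already percent-encoded):

    import logging
    import rfc3986

    def canonicalize_url(url):
        try:
            parsed = rfc3986.urlparse(url)
            # re-encode only the host (IDNA/punycode); everything else passes through
            return parsed.copy_with(host=parsed.host.encode('idna')).unsplit()
        except (TypeError, ValueError) as e:
            # e.g. url is None (TypeError on parse) or the host won't IDNA-encode
            logging.warning('canonicalization failed for %s: %s', url, e)
            return url

    print(canonicalize_url('http://чунджа.kz/b/%C2%B6-non-ascii'))
    # per the test assertions: http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii

On failure the raw URL is returned unchanged, so a missing or malformed
hop_via_url degrades that crawl log field rather than aborting the log write.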