Better handle non-ascii urls for crawl log hop info

Adam Miller 2022-04-20 22:48:28 +00:00
parent 5ae1291e37
commit 1e3d22aba4
5 changed files with 107 additions and 26 deletions

View File

@@ -35,6 +35,7 @@ deps = [
     'idna==2.10',
     'PyYAML>=5.1',
    'cachetools',
+    'rfc3986>=2.0.0',
 ]
 try:
     import concurrent.futures

View File

@@ -19,7 +19,7 @@
 # USA.
 #
-FROM phusion/baseimage
+FROM ubuntu:focal-20220404
 MAINTAINER Noah Levitt <nlevitt@archive.org>
 # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
@@ -28,10 +28,11 @@ MAINTAINER Noah Levitt <nlevitt@archive.org>
 ENV LANG=C.UTF-8
 RUN apt-get update && apt-get --auto-remove -y dist-upgrade
+RUN apt-get install -y ca-certificates curl gnupg wget
 # Add the RethinkDB repository and public key
-RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
-    && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
+RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
+RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
     && apt-get update && apt-get -y install rethinkdb
 RUN mkdir -vp /etc/service/rethinkdb \
@@ -57,25 +58,54 @@ RUN mkdir -vp /etc/service/tor \
     && chmod a+x /etc/service/tor/run
 # hadoop hdfs for trough
-RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
-    && . /etc/lsb-release \
-    && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list
-RUN apt-get update
-RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
-RUN su hdfs -c 'hdfs namenode -format'
-RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
-    && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
-RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
-    && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
-RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
-    && chmod a+x /etc/my_init.d/50_start_hdfs.sh
-RUN apt-get install -y libsqlite3-dev
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get install -y openjdk-8-jdk openssh-server
+# set java home
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+# setup ssh with no passphrase
+RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
+    && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
+RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
+    && tar xfz hadoop-2.7.3.tar.gz \
+    && mv /hadoop-2.7.3 /usr/local/hadoop \
+    && rm /hadoop-2.7.3.tar.gz
+# hadoop environment variables
+ENV HADOOP_HOME=/usr/local/hadoop
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+# hadoop-store
+RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
+    && mkdir -p $HADOOP_HOME/hdfs/datanode
+# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
+COPY config/ /tmp/
+RUN mv /tmp/ssh_config $HOME/.ssh/config \
+    && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
+    && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
+    && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
+    && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
+    && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
+    && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+# Add startup script
+ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh
+# set permissions
+RUN chmod 744 -R $HADOOP_HOME
+# format namenode
+RUN $HADOOP_HOME/bin/hdfs namenode -format
+# run hadoop services
+#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash
+RUN apt-get install -y libsqlite3-dev build-essential
 # trough itself
 RUN virtualenv -p python3 /opt/trough-ve3 \
@@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
     && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
     && chmod a+x /etc/service/trough-segment-manager-server/run
+RUN apt-get install -y daemontools daemontools-run

View File

@@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 docker build -t internetarchive/warcprox-tests $script_dir
 
-docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
+docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
        bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
            && (cd /warcprox && git diff HEAD) | patch -p1 \
            && virtualenv -p python3 /tmp/venv \
            && source /tmp/venv/bin/activate \
-            && pip --log-file /tmp/pip.log install . pytest mock requests warcio \
-            && py.test -v tests \
-            && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
+            && pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
+            && py.test -v tests; \
+            svscan /etc/service & \
+            sleep 10; \
+            py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
            && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
+            && /usr/local/hadoop/hadoop-services.sh \
            && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
            "

View File

@@ -2106,6 +2106,40 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert set(extra_info.keys()) == {'exception'}
 
+    # Verify non-ascii urls are encoded properly
+    url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_8",
+            "metadata":{'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_8-%s-%s.log' % (hostname, port))
+    assert os.path.exists(file)
+    crawl_log_8 = open(file, 'rb').read()
+
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
+    assert crawl_log_8[24:31] == b'   200 '
+    assert crawl_log_8[31:42] == b'       154 '
+    fields = crawl_log_8.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
+    assert fields[4] == b'L'
+    assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
+    assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

View File

@@ -25,6 +25,7 @@ import json
 import os
 import warcprox
 import socket
+import rfc3986
 from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
 
 class CrawlLogger(object):
@@ -67,8 +68,9 @@ class CrawlLogger(object):
         logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)
         hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
-        brozzled_url = recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')
-        hop_via_url = recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')
+        #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
+        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
         if hop_path is None and brozzled_url is None and hop_via_url is None:
             #No hop info headers provided
@@ -81,13 +83,11 @@ class CrawlLogger(object):
             hop_via_url = "-"
         #Prefer referer header. Otherwise use provided via_url
         via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
+        logging.info('brozzled_url:%s recorded_url:%s' , brozzled_url, recorded_url.url)
         if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
             #Requested page is not the Brozzled url, thus we are an embed or redirect.
             via_url = brozzled_url
-            if hop_path == "-":
-                hop_path = "B"
-            else:
-                hop_path = "".join([hop_path,"B"])
+            hop_path = "B" if hop_path == "-" else "".join([hop_path,"B"])
         fields = [
             '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
@@ -148,3 +148,15 @@ class CrawlLogger(object):
         else:
             return recorded_url.status
+
+    def canonicalize_url(self, url):
+        #URL needs to be split out to separately encode the hostname from the rest of the path.
+        #hostname will be idna encoded (punycode)
+        #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
+        try:
+            parsed_url = rfc3986.urlparse(url)
+            encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+            return encoded_url.unsplit()
+        except (TypeError, ValueError) as e:
+            logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
+            return url
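
For reference, the host-only transformation that canonicalize_url performs via rfc3986 can be sketched with just the Python standard library. This is an illustrative helper, not part of the commit: the name idna_encode_host is hypothetical, and the sample URL and expected punycode host are taken from the new test above.

    # Minimal sketch of the idea behind canonicalize_url: punycode only the hostname,
    # leave the path/query untouched, and fall back to the raw URL if parsing fails.
    # (userinfo is ignored for brevity; the real patch uses rfc3986 instead of urllib.)
    from urllib.parse import urlsplit, urlunsplit

    def idna_encode_host(url):
        try:
            parts = urlsplit(url)
            host = parts.hostname.encode('idna').decode('ascii')
            netloc = host if parts.port is None else '%s:%d' % (host, parts.port)
            return urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment))
        except (AttributeError, UnicodeError, ValueError):
            # mirror the patch's behavior: on failure, return the URL unchanged
            return url

    print(idna_encode_host('http://чунджа.kz/b/¶-non-ascii'))
    # -> http://xn--80ahg0a3ax.kz/b/¶-non-ascii  (same punycode host the test asserts)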