Mirror of https://github.com/internetarchive/warcprox.git
Synced 2025-01-18 13:22:09 +01:00
Better handle non-ASCII URLs for crawl log hop info
parent 5ae1291e37
commit 1e3d22aba4
setup.py
@@ -35,6 +35,7 @@ deps = [
     'idna==2.10',
     'PyYAML>=5.1',
     'cachetools',
+    'rfc3986>=2.0.0',
 ]
 try:
     import concurrent.futures
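The only setup.py change is the new rfc3986 dependency, which supplies the URL-parsing API used by the crawl-log change further down. A minimal, illustrative sketch (not part of the commit) of the calls it relies on, using the non-ASCII host from the new test:

    import rfc3986

    # Hypothetical illustration only; it mirrors the calls made by the new
    # CrawlLogger.canonicalize_url() added later in this commit.
    url = 'http://чунджа.kz/b/¶-non-ascii'
    parsed = rfc3986.urlparse(url)
    # IDNA-encode (punycode) just the hostname, leaving path and query untouched,
    # then reassemble the URL. The new test below expects the host to come out
    # as xn--80ahg0a3ax.kz.
    encoded = parsed.copy_with(host=parsed.host.encode('idna'))
    canonical = encoded.unsplit()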
tests/Dockerfile
@@ -19,7 +19,7 @@
 # USA.
 #

-FROM phusion/baseimage
+FROM ubuntu:focal-20220404
 MAINTAINER Noah Levitt <nlevitt@archive.org>

 # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
@@ -28,10 +28,11 @@ MAINTAINER Noah Levitt <nlevitt@archive.org>
 ENV LANG=C.UTF-8

 RUN apt-get update && apt-get --auto-remove -y dist-upgrade
+RUN apt-get install -y ca-certificates curl gnupg wget

 # Add the RethinkDB repository and public key
-RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
-    && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
+RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
+RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
     && apt-get update && apt-get -y install rethinkdb

 RUN mkdir -vp /etc/service/rethinkdb \
@@ -57,25 +58,54 @@ RUN mkdir -vp /etc/service/tor \
     && chmod a+x /etc/service/tor/run

 # hadoop hdfs for trough
-RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
-    && . /etc/lsb-release \
-    && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list
-
-RUN apt-get update
-RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
-
-RUN su hdfs -c 'hdfs namenode -format'
-
-RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
-    && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
-
-RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
-    && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
-
-RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
-    && chmod a+x /etc/my_init.d/50_start_hdfs.sh
-
-RUN apt-get install -y libsqlite3-dev
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get install -y openjdk-8-jdk openssh-server
+
+# set java home
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+
+# setup ssh with no passphrase
+RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
+    && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
+
+RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
+    && tar xfz hadoop-2.7.3.tar.gz \
+    && mv /hadoop-2.7.3 /usr/local/hadoop \
+    && rm /hadoop-2.7.3.tar.gz
+
+# hadoop environment variables
+ENV HADOOP_HOME=/usr/local/hadoop
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+
+# hadoop-store
+RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
+    && mkdir -p $HADOOP_HOME/hdfs/datanode
+
+# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
+COPY config/ /tmp/
+RUN mv /tmp/ssh_config $HOME/.ssh/config \
+    && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
+    && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
+    && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
+    && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
+    && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
+    && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+
+# Add startup script
+ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh
+
+# set permissions
+RUN chmod 744 -R $HADOOP_HOME
+
+# format namenode
+RUN $HADOOP_HOME/bin/hdfs namenode -format
+
+# run hadoop services
+#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash
+
+RUN apt-get install -y libsqlite3-dev build-essential

 # trough itself
 RUN virtualenv -p python3 /opt/trough-ve3 \
@@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
     && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
     && chmod a+x /etc/service/trough-segment-manager-server/run

+RUN apt-get install -y daemontools daemontools-run
tests/run-tests.sh
@@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

 docker build -t internetarchive/warcprox-tests $script_dir

-docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
+docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
     bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
         && (cd /warcprox && git diff HEAD) | patch -p1 \
         && virtualenv -p python3 /tmp/venv \
         && source /tmp/venv/bin/activate \
-        && pip --log-file /tmp/pip.log install . pytest mock requests warcio \
-        && py.test -v tests \
-        && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
+        && pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
+        && py.test -v tests; \
+        svscan /etc/service & \
+        sleep 10; \
+        py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
         && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
+        && /usr/local/hadoop/hadoop-services.sh \
         && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
         "
tests/test_warcprox.py
@@ -2106,6 +2106,40 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert set(extra_info.keys()) == {'exception'}

+    # Verify non-ascii urls are encoded properly
+    url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix": "test_crawl_log_8",
+            "metadata": {'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)
+
+    file = os.path.join(
+        warcprox_.options.crawl_log_dir,
+        'test_crawl_log_8-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_8 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
+    assert crawl_log_8[24:31] == b'   200 '
+    assert crawl_log_8[31:42] == b'       154 '
+    fields = crawl_log_8.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
+    assert fields[4] == b'L'
+    assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
+    assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
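The new assertions expect the logged URLs to carry an IDNA-encoded host and a percent-encoded path. A small standard-library illustration (not part of the commit) of where those literal values come from:

    from urllib.parse import quote

    # IDNA (punycode) encoding of the non-ASCII hostname used in the test metadata,
    # matching the xn--80ahg0a3ax.kz expected in the crawl log assertions above.
    assert 'чунджа.kz'.encode('idna') == b'xn--80ahg0a3ax.kz'

    # UTF-8 percent-encoding of the pilcrow character in the request path.
    assert quote('¶') == '%C2%B6'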
warcprox/crawl_log.py
@@ -25,6 +25,7 @@ import json
 import os
 import warcprox
 import socket
+import rfc3986
 from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError

 class CrawlLogger(object):
@@ -67,8 +68,9 @@ class CrawlLogger(object):
         logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)

         hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
-        brozzled_url = recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')
-        hop_via_url = recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')
+        # URLs are url-encoded into plain ascii urls by the HTTP spec. Since we're comparing against those, the urls sent over the json blob need to be encoded similarly.
+        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))

         if hop_path is None and brozzled_url is None and hop_via_url is None:
             #No hop info headers provided
@@ -81,13 +83,11 @@ class CrawlLogger(object):
             hop_via_url = "-"
         #Prefer referer header. Otherwise use provided via_url
         via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
+        logging.info('brozzled_url:%s recorded_url:%s', brozzled_url, recorded_url.url)
         if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
             #Requested page is not the Brozzled url, thus we are an embed or redirect.
             via_url = brozzled_url
-            if hop_path == "-":
-                hop_path = "B"
-            else:
-                hop_path = "".join([hop_path,"B"])
+            hop_path = "B" if hop_path == "-" else "".join([hop_path, "B"])

         fields = [
             '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
@@ -148,3 +148,15 @@ class CrawlLogger(object):
         else:
             return recorded_url.status
+
+    def canonicalize_url(self, url):
+        # The URL needs to be split up to encode the hostname separately from the rest of the path.
+        # The hostname is idna-encoded (punycode).
+        # The rest of the URL is url-encoded, but browsers only encode "unsafe" and not "reserved" characters, so reserved chars are ignored.
+        try:
+            parsed_url = rfc3986.urlparse(url)
+            encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+            return encoded_url.unsplit()
+        except (TypeError, ValueError) as e:
+            logging.warning("URL canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
+            return url
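To make the hop-info behavior above concrete, here is a small self-contained sketch (not the warcprox code itself, and simplified: it drops the check that brozzled_url was actually present in the Warcprox-Meta metadata) of how hop_path and via_url come out for the brozzled page versus an embedded resource:

    # Hypothetical standalone rendering of the hop-info logic shown in the hunk above.
    def combine_hop_info(hop_path, brozzled_url, hop_via_url, recorded_url, referer=None):
        # Prefer the referer header; otherwise fall back to the provided hop_via_url.
        via_url = (referer or hop_via_url) if hop_path != "-" else "-"
        if brozzled_url is not None and brozzled_url != recorded_url:
            # Requested page is not the brozzled url, so it is an embed or redirect.
            via_url = brozzled_url
            hop_path = "B" if hop_path == "-" else hop_path + "B"
        return hop_path, via_url

    # The brozzled page itself: hop info passes through unchanged.
    assert combine_hop_info("L", "http://example.com/page", "http://example.com/",
                            "http://example.com/page") == ("L", "http://example.com/")

    # An embedded resource: hop_path gains a trailing "B" and via_url becomes the brozzled page.
    assert combine_hop_info("L", "http://example.com/page", "http://example.com/",
                            "http://example.com/img.png") == ("LB", "http://example.com/page")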