Merge pull request #171 from internetarchive/adds-hop-path-logging

Adds hop path logging

The crawl log now fills in the hop path field (previously always '-') and improves the via URL (previously just the Referer header), using the hop_path, brozzled_url and hop_via_url entries of Warcprox-Meta metadata. URLs from the metadata are canonicalized (idna-encoded host) so they can be compared against the fetched URL, and 'B' is appended to the hop path when the recorded URL turns out to be an embed or redirect of the brozzled page.
Adam Miller 2022-04-26 12:11:11 -07:00 committed by GitHub
commit 9521042a23
5 changed files with 129 additions and 24 deletions
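
As a rough illustration of what this change consumes (a sketch, not part of the diff below): a crawler such as brozzler can pass hop-path information to warcprox in the Warcprox-Meta request header. The proxy address and URLs here are placeholders; the metadata keys (hop_path, brozzled_url, hop_via_url) are the ones the new crawl-log code reads.

import json
import requests

warcprox_meta = {
    'warc-prefix': 'my-crawl',
    'metadata': {
        'hop_path': 'L',                            # Heritrix-style hop path so far
        'brozzled_url': 'http://example.com/page',  # page the browser was asked to load
        'hop_via_url': 'http://example.com/',       # URL this hop was discovered from
    },
}
requests.get(
    'http://example.com/page',
    proxies={'http': 'http://localhost:8000'},      # warcprox listening here (placeholder)
    headers={'Warcprox-Meta': json.dumps(warcprox_meta)},
)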

View File

@@ -35,6 +35,7 @@ deps = [
     'idna==2.10',
     'PyYAML>=5.1',
     'cachetools',
+    'rfc3986>=1.5.0',
 ]
 try:
     import concurrent.futures
@@ -43,7 +44,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.4.28',
+        version='2.4.29',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

View File

@@ -19,7 +19,7 @@
 # USA.
 #
 
-FROM phusion/baseimage
+FROM ubuntu:focal-20220404
 MAINTAINER Noah Levitt <nlevitt@archive.org>
 
 # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
@@ -28,10 +28,11 @@ MAINTAINER Noah Levitt <nlevitt@archive.org>
 ENV LANG=C.UTF-8
 
 RUN apt-get update && apt-get --auto-remove -y dist-upgrade
+RUN apt-get install -y ca-certificates curl gnupg wget
 
 # Add the RethinkDB repository and public key
-RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
-    && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
+RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
+RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
     && apt-get update && apt-get -y install rethinkdb
 
 RUN mkdir -vp /etc/service/rethinkdb \
@@ -57,25 +58,54 @@ RUN mkdir -vp /etc/service/tor \
     && chmod a+x /etc/service/tor/run
 
 # hadoop hdfs for trough
-RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
-    && . /etc/lsb-release \
-    && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list
-RUN apt-get update
-RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
-RUN su hdfs -c 'hdfs namenode -format'
-RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
-    && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
-RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
-    && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
-RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
-    && chmod a+x /etc/my_init.d/50_start_hdfs.sh
-RUN apt-get install -y libsqlite3-dev
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get install -y openjdk-8-jdk openssh-server
+
+# set java home
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+
+# setup ssh with no passphrase
+RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
+    && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
+
+RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
+    && tar xfz hadoop-2.7.3.tar.gz \
+    && mv /hadoop-2.7.3 /usr/local/hadoop \
+    && rm /hadoop-2.7.3.tar.gz
+
+# hadoop environment variables
+ENV HADOOP_HOME=/usr/local/hadoop
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
+
+# hadoop-store
+RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
+    && mkdir -p $HADOOP_HOME/hdfs/datanode
+
+# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
+COPY config/ /tmp/
+RUN mv /tmp/ssh_config $HOME/.ssh/config \
+    && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
+    && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
+    && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
+    && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
+    && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
+    && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
+
+# Add startup script
+ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh
+
+# set permissions
+RUN chmod 744 -R $HADOOP_HOME
+
+# format namenode
+RUN $HADOOP_HOME/bin/hdfs namenode -format
+
+# run hadoop services
+#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash
+
+RUN apt-get install -y libsqlite3-dev build-essential
 
 # trough itself
 RUN virtualenv -p python3 /opt/trough-ve3 \
@@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
     && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
     && chmod a+x /etc/service/trough-segment-manager-server/run
+RUN apt-get install -y daemontools daemontools-run

View File

@@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 docker build -t internetarchive/warcprox-tests $script_dir
 
-docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
+docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
         bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
             && (cd /warcprox && git diff HEAD) | patch -p1 \
             && virtualenv -p python3 /tmp/venv \
             && source /tmp/venv/bin/activate \
-            && pip --log-file /tmp/pip.log install . pytest mock requests warcio \
-            && py.test -v tests \
-            && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
+            && pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
+            && py.test -v tests; \
+            svscan /etc/service & \
+            sleep 10; \
+            py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
             && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
+            && /usr/local/hadoop/hadoop-services.sh \
             && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
         "

View File

@@ -2106,6 +2106,40 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert set(extra_info.keys()) == {'exception'}
 
+    #Verify non-ascii urls are encoded properly
+    url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_8",
+            "metadata":{'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_8-%s-%s.log' % (hostname, port))
+    assert os.path.exists(file)
+    crawl_log_8 = open(file, 'rb').read()
+
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
+    assert crawl_log_8[24:31] == b'   200 '
+    assert crawl_log_8[31:42] == b'       154 '
+    fields = crawl_log_8.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
+    assert fields[4] == b'L'
+    assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
+    assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
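
For reference, a small sketch (not part of the change) of reading the new fields back out of a crawl log like the one the test above checks: each line has 13 space-separated fields, with the hop path in field 4 and the via URL in field 5 (0-indexed, matching the assertions). The log path below is a placeholder.

import json

with open('/tmp/crawl_logs/my-crawl-example-8000.log', 'rb') as f:
    for line in f:
        fields = line.split()                      # 13 fields per line
        url, hop_path, via_url = fields[3], fields[4], fields[5]
        extra_info = json.loads(fields[12].decode('utf-8'))
        print(url.decode(), hop_path.decode(), via_url.decode(), extra_info)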

View File

@@ -25,6 +25,7 @@ import json
 import os
 import warcprox
 import socket
+import rfc3986
 from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
 
 class CrawlLogger(object):
@@ -64,13 +65,37 @@ class CrawlLogger(object):
         else:
             content_length = 0
             payload_digest = '-'
+        logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)
+        hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
+        #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
+        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
+
+        if hop_path is None and brozzled_url is None and hop_via_url is None:
+            #No hop info headers provided
+            hop_path = "-"
+            via_url = recorded_url.referer or '-'
+        else:
+            if hop_path is None:
+                hop_path = "-"
+            if hop_via_url is None:
+                hop_via_url = "-"
+            #Prefer referer header. Otherwise use provided via_url
+            via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
+            logging.info('brozzled_url:%s recorded_url:%s' , brozzled_url, recorded_url.url)
+            if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
+                #Requested page is not the Brozzled url, thus we are an embed or redirect.
+                via_url = brozzled_url
+                hop_path = "B" if hop_path == "-" else "".join([hop_path,"B"])
+
         fields = [
             '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
             '% 5s' % status,
             '% 10s' % content_length,
             recorded_url.url,
-            '-', # hop path
-            recorded_url.referer or '-',
+            hop_path,
+            via_url,
             recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
             '-',
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
@@ -89,7 +114,6 @@ class CrawlLogger(object):
         except:
             pass
         line = b' '.join(fields) + b'\n'
-
         prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
         filename = '%s-%s-%s.log' % (
             prefix, self.hostname, self.options.server_port)
@@ -124,3 +148,15 @@ class CrawlLogger(object):
         else:
             return recorded_url.status
+
+    def canonicalize_url(self, url):
+        #URL needs to be split out to separately encode the hostname from the rest of the path.
+        #hostname will be idna encoded (punycode)
+        #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars.
+        try:
+            parsed_url = rfc3986.urlparse(url)
+            encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+            return encoded_url.unsplit()
+        except (TypeError, ValueError) as e:
+            logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
+            return url
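
For reference, a minimal standalone sketch (assuming the rfc3986>=1.5.0 requirement added above) of the canonicalization the new canonicalize_url() helper performs: only the hostname is idna/punycode-encoded, everything else is left as the client sent it.

import rfc3986

def canonicalize(url):
    parsed = rfc3986.urlparse(url)
    # encode just the host; reserved characters elsewhere are left untouched
    return parsed.copy_with(host=parsed.host.encode('idna')).unsplit()

print(canonicalize('http://чунджа.kz/b/page'))
# the host becomes xn--80ahg0a3ax.kz, matching the expectation in the new test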