Merge pull request #171 from internetarchive/adds-hop-path-logging
Adds hop path logging
commit 9521042a23
setup.py

@@ -35,6 +35,7 @@ deps = [
     'idna==2.10',
     'PyYAML>=5.1',
     'cachetools',
+    'rfc3986>=1.5.0',
 ]
 try:
     import concurrent.futures
@@ -43,7 +44,7 @@ except:

 setuptools.setup(
     name='warcprox',
-    version='2.4.28',
+    version='2.4.29',
     description='WARC writing MITM HTTP/S proxy',
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
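The new rfc3986 pin exists for the URL canonicalization added to CrawlLogger further down in this diff. A minimal sketch of the API surface in question, assuming rfc3986>=1.5.0, for orientation:

    # minimal sketch of the rfc3986 calls this change pulls in (rfc3986>=1.5.0 assumed)
    import rfc3986

    parsed = rfc3986.urlparse('http://example.com/path?q=1')
    print(parsed.host)                                     # 'example.com'
    print(parsed.copy_with(host='example.org').unsplit())  # 'http://example.org/path?q=1'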
tests/Dockerfile

@@ -19,7 +19,7 @@
 # USA.
 #

-FROM phusion/baseimage
+FROM ubuntu:focal-20220404
 MAINTAINER Noah Levitt <nlevitt@archive.org>

 # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
@@ -28,10 +28,11 @@ MAINTAINER Noah Levitt <nlevitt@archive.org>
 ENV LANG=C.UTF-8

 RUN apt-get update && apt-get --auto-remove -y dist-upgrade
+RUN apt-get install -y ca-certificates curl gnupg wget

 # Add the RethinkDB repository and public key
-RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
-    && echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
+RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
+RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
     && apt-get update && apt-get -y install rethinkdb

 RUN mkdir -vp /etc/service/rethinkdb \
@@ -57,25 +58,54 @@ RUN mkdir -vp /etc/service/tor \
     && chmod a+x /etc/service/tor/run

 # hadoop hdfs for trough
-RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
-    && . /etc/lsb-release \
-    && echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list

 RUN apt-get update
-RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get install -y openjdk-8-jdk openssh-server

-RUN su hdfs -c 'hdfs namenode -format'
+# set java home
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

-RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
-    && cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
+# setup ssh with no passphrase
+RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
+    && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys

-RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
-    && cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
+RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
+    && tar xfz hadoop-2.7.3.tar.gz \
+    && mv /hadoop-2.7.3 /usr/local/hadoop \
+    && rm /hadoop-2.7.3.tar.gz

+# hadoop environment variables
+ENV HADOOP_HOME=/usr/local/hadoop
+ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

-RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
-    && chmod a+x /etc/my_init.d/50_start_hdfs.sh
+# hadoop-store
+RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
+    && mkdir -p $HADOOP_HOME/hdfs/datanode

-RUN apt-get install -y libsqlite3-dev
+# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
+COPY config/ /tmp/
+RUN mv /tmp/ssh_config $HOME/.ssh/config \
+    && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
+    && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
+    && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
+    && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
+    && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
+    && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml

+# Add startup script
+ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh

+# set permissions
+RUN chmod 744 -R $HADOOP_HOME

+# format namenode
+RUN $HADOOP_HOME/bin/hdfs namenode -format

+# run hadoop services
+#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash

+RUN apt-get install -y libsqlite3-dev build-essential

 # trough itself
 RUN virtualenv -p python3 /opt/trough-ve3 \
@@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
     && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
     && chmod a+x /etc/service/trough-segment-manager-server/run

+RUN apt-get install -y daemontools daemontools-run
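Note that config/hadoop-services.sh itself is not part of this diff. A hypothetical sketch of what such a startup script might contain under this image (start-dfs.sh ships with Hadoop 2.7.3; the actual file may differ):

    #!/bin/bash
    # hypothetical sketch only: the real tests/config/hadoop-services.sh is not shown in this diff
    service ssh start                  # the hadoop start scripts connect over ssh
    $HADOOP_HOME/sbin/start-dfs.sh     # brings up the namenode and datanode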
tests/run-tests.sh

@@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

 docker build -t internetarchive/warcprox-tests $script_dir

-docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
+docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
     bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
         && (cd /warcprox && git diff HEAD) | patch -p1 \
         && virtualenv -p python3 /tmp/venv \
         && source /tmp/venv/bin/activate \
-        && pip --log-file /tmp/pip.log install . pytest mock requests warcio \
-        && py.test -v tests \
-        && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
+        && pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
+        && py.test -v tests; \
+        svscan /etc/service & \
+        sleep 10; \
+        py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
         && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
+        && /usr/local/hadoop/hadoop-services.sh \
         && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
         "
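With phusion's /sbin/my_init gone from the new base image, the script now launches the container's daemontools-supervised services itself before the rethinkdb- and trough-backed test passes. The core of the change is this pattern:

    svscan /etc/service &   # start daemontools supervision of the rethinkdb, tor and trough services
    sleep 10                # give those services time to come up before the tests that need them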
tests/test_warcprox.py

@@ -2106,6 +2106,40 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert set(extra_info.keys()) == {'exception'}

+    # Verify non-ascii urls are encoded properly
+    url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
+    headers = {
+        "Warcprox-Meta": json.dumps({"warc-prefix": "test_crawl_log_8",
+            "metadata": {'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
+    }
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)
+
+    file = os.path.join(
+        warcprox_.options.crawl_log_dir,
+        'test_crawl_log_8-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_8 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
+    assert crawl_log_8[24:31] == b' 200 '
+    assert crawl_log_8[31:42] == b' 154 '
+    fields = crawl_log_8.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
+    assert fields[4] == b'L'
+    assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
+    assert fields[6] == b'text/plain'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
+    assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
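The three new metadata keys mirror what a Brozzler-style client would put in the Warcprox-Meta request header. A minimal sketch of such a request, assuming warcprox listens on its default localhost:8000 (names and URLs illustrative):

    # minimal sketch of a request carrying the new hop metadata (values illustrative)
    import json
    import requests

    meta = {
        'warc-prefix': 'my-crawl',
        'metadata': {
            'hop_path': 'L',                            # Heritrix-style hop path so far
            'brozzled_url': 'http://example.com/page',  # page the browser was told to load
            'hop_via_url': 'http://example.com/',       # URL the page was discovered from
        },
    }
    requests.get('http://example.com/page',
                 proxies={'http': 'http://localhost:8000'},  # warcprox address is an assumption
                 headers={'Warcprox-Meta': json.dumps(meta)})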
warcprox/crawl_log.py

@@ -25,6 +25,7 @@ import json
 import os
 import warcprox
 import socket
+import rfc3986
 from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError

 class CrawlLogger(object):
@@ -64,13 +65,37 @@ class CrawlLogger(object):
         else:
             content_length = 0
             payload_digest = '-'
+        logging.info('warcprox_meta %s', recorded_url.warcprox_meta)
+
+        hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
+        # URLs are url-encoded into plain ascii urls by the HTTP spec. Since we're comparing against those, the urls sent over the json blob need to be encoded similarly.
+        brozzled_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
+        hop_via_url = self.canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
+
+        if hop_path is None and brozzled_url is None and hop_via_url is None:
+            # no hop info headers provided
+            hop_path = "-"
+            via_url = recorded_url.referer or '-'
+        else:
+            if hop_path is None:
+                hop_path = "-"
+            if hop_via_url is None:
+                hop_via_url = "-"
+            # prefer the referer header; otherwise use the provided via url
+            via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
+            logging.info('brozzled_url:%s recorded_url:%s', brozzled_url, recorded_url.url)
+            if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
+                # requested page is not the brozzled url, thus we are an embed or redirect
+                via_url = brozzled_url
+                hop_path = "B" if hop_path == "-" else "".join([hop_path, "B"])
+
         fields = [
             '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
             '% 5s' % status,
             '% 10s' % content_length,
             recorded_url.url,
-            '-', # hop path
-            recorded_url.referer or '-',
+            hop_path,
+            via_url,
             recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
             '-',
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
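The upshot of the branch above: a fetch whose URL differs from brozzled_url is treated as an embed or redirect, getting the brozzled page as its via URL and a 'B' appended to its hop path. A worked trace under assumed values:

    # worked example (assumed values), tracing the hop logic above:
    # metadata = {'hop_path': 'L', 'brozzled_url': 'http://example.com/page',
    #             'hop_via_url': 'http://example.com/'}
    #
    # recorded_url.url == b'http://example.com/style.css'  (an embed; no Referer header)
    #   -> brozzled_url != recorded_url.url, so:
    #        via_url  = 'http://example.com/page'
    #        hop_path = 'L' + 'B'  ->  'LB'
    #
    # recorded_url.url == b'http://example.com/page'  (the brozzled page itself)
    #   -> hop_path stays 'L'; via_url = referer, falling back to 'http://example.com/'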
@@ -89,7 +114,6 @@ class CrawlLogger(object):
         except:
             pass
         line = b' '.join(fields) + b'\n'
-
         prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
         filename = '%s-%s-%s.log' % (
             prefix, self.hostname, self.options.server_port)
@@ -124,3 +148,15 @@ class CrawlLogger(object):
         else:
             return recorded_url.status

+    def canonicalize_url(self, url):
+        # The URL is split so the hostname can be encoded separately from the rest of
+        # the path: the hostname is idna-encoded (punycode), while the rest of the URL
+        # is left alone, since browsers urlencode only "unsafe" characters and not
+        # "reserved" ones.
+        try:
+            parsed_url = rfc3986.urlparse(url)
+            encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
+            return encoded_url.unsplit()
+        except (TypeError, ValueError) as e:
+            logging.warning('URL canonicalization failure. Returning raw url: rfc3986 %s - %s', url, e)
+            return url
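A usage sketch of the new helper, mirroring its rfc3986 calls; the punycode result matches the test assertion earlier in this diff (the path is shown unencoded, since only the host is transformed here):

    # usage sketch mirroring canonicalize_url (host encoding per the test expectation above)
    import rfc3986

    parsed = rfc3986.urlparse('http://чунджа.kz/b/some-path')
    print(parsed.copy_with(host=parsed.host.encode('idna')).unsplit())
    # -> http://xn--80ahg0a3ax.kz/b/some-path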