Merge branch 'tls-fingerprint' into qa

This commit is contained in:
Barbara Miller 2022-07-01 11:11:54 -07:00
commit ab172189fd
7 changed files with 164 additions and 30 deletions

View File

@ -35,6 +35,7 @@ deps = [
'idna==2.10',
'PyYAML>=5.1',
'cachetools',
'rfc3986>=1.5.0',
]
try:
import concurrent.futures
@ -43,7 +44,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.28',
version='2.4.29',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -19,7 +19,7 @@
# USA.
#
FROM phusion/baseimage
FROM ubuntu:focal-20220404
MAINTAINER Noah Levitt <nlevitt@archive.org>
# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
@ -28,10 +28,11 @@ MAINTAINER Noah Levitt <nlevitt@archive.org>
ENV LANG=C.UTF-8
RUN apt-get update && apt-get --auto-remove -y dist-upgrade
RUN apt-get install -y ca-certificates curl gnupg wget
# Add the RethinkDB repository and public key
RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
&& echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add -
RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \
&& apt-get update && apt-get -y install rethinkdb
RUN mkdir -vp /etc/service/rethinkdb \
@ -57,25 +58,54 @@ RUN mkdir -vp /etc/service/tor \
&& chmod a+x /etc/service/tor/run
# hadoop hdfs for trough
RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
&& . /etc/lsb-release \
&& echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list
RUN apt-get update
RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get install -y openjdk-8-jdk openssh-server
RUN su hdfs -c 'hdfs namenode -format'
# set java home
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
&& cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
# setup ssh with no passphrase
RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \
&& cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
&& cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \
&& tar xfz hadoop-2.7.3.tar.gz \
&& mv /hadoop-2.7.3 /usr/local/hadoop \
&& rm /hadoop-2.7.3.tar.gz
# hadoop environment variables
ENV HADOOP_HOME=/usr/local/hadoop
ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
&& chmod a+x /etc/my_init.d/50_start_hdfs.sh
# hadoop-store
RUN mkdir -p $HADOOP_HOME/hdfs/namenode \
&& mkdir -p $HADOOP_HOME/hdfs/datanode
RUN apt-get install -y libsqlite3-dev
# Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html
COPY config/ /tmp/
RUN mv /tmp/ssh_config $HOME/.ssh/config \
&& mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
&& mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \
&& mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \
&& mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \
&& cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \
&& mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
# Add startup script
ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh
# set permissions
RUN chmod 744 -R $HADOOP_HOME
# format namenode
RUN $HADOOP_HOME/bin/hdfs namenode -format
# run hadoop services
#ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash
RUN apt-get install -y libsqlite3-dev build-essential
# trough itself
RUN virtualenv -p python3 /opt/trough-ve3 \
@ -107,3 +137,4 @@ RUN mkdir -vp /etc/service/trough-segment-manager-server \
&& echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \
&& chmod a+x /etc/service/trough-segment-manager-server/run
RUN apt-get install -y daemontools daemontools-run

View File

@ -31,15 +31,18 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
docker build -t internetarchive/warcprox-tests $script_dir
docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \
bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
&& (cd /warcprox && git diff HEAD) | patch -p1 \
&& virtualenv -p python3 /tmp/venv \
&& source /tmp/venv/bin/activate \
&& pip --log-file /tmp/pip.log install . pytest mock requests warcio \
&& py.test -v tests \
&& py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
&& pip --log-file /tmp/pip.log install . pytest mock requests warcio trough \
&& py.test -v tests; \
svscan /etc/service & \
sleep 10; \
py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
&& py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
&& /usr/local/hadoop/hadoop-services.sh \
&& py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
"

View File

@ -68,6 +68,7 @@ import certauth.certauth
import warcprox
import warcprox.main
import warcprox.crawl_log as crawl_log
try:
import http.client as http_client
@ -2107,6 +2108,47 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
extra_info = json.loads(fields[12].decode('utf-8'))
assert set(extra_info.keys()) == {'exception'}
#Verify non-ascii urls are encoded properly
url = 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port
headers = {
"Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_8",
"metadata":{'seed': 'http://example.com/¶-non-ascii', 'hop_path': 'L', 'brozzled_url': 'http://localhost:%s/b/¶-non-ascii' % http_daemon.server_port, 'hop_via_url': 'http://чунджа.kz/b/¶-non-ascii'}}),
}
response = requests.get(url, proxies=archiving_proxies, headers=headers)
assert response.status_code == 200
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 9)
file = os.path.join(
warcprox_.options.crawl_log_dir,
'test_crawl_log_8-%s-%s.log' % (hostname, port))
assert os.path.exists(file)
crawl_log_8 = open(file, 'rb').read()
assert re.match(br'\A2[^\n]+\n\Z', crawl_log_8)
assert crawl_log_8[24:31] == b' 200 '
assert crawl_log_8[31:42] == b' 154 '
fields = crawl_log_8.split()
assert len(fields) == 13
assert fields[3].endswith(b'/b/%C2%B6-non-ascii')
assert fields[4] == b'L'
assert fields[5].endswith(b'http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii')
assert fields[6] == b'text/plain'
assert fields[7] == b'-'
assert re.match(br'^\d{17}[+]\d{3}', fields[8])
assert fields[9] == b'sha1:cdd841ea7c5e46fde3fba56b2e45e4df5aeec439'
assert fields[10].endswith('/¶-non-ascii'.encode('utf-8'))
assert fields[11] == b'-'
extra_info = json.loads(fields[12].decode('utf-8'))
def test_crawl_log_canonicalization():
    """Unit-test canonicalize_url(): sentinel values pass through unchanged,
    hostnames are idna/punycode-encoded and non-ascii path chars percent-encoded,
    and unparseable input is returned as-is."""
    assert crawl_log.canonicalize_url(None) is None
    # fixed: original used `is ''` — identity comparison against a string
    # literal is a CPython implementation detail (and a SyntaxWarning on
    # modern interpreters); equality is the correct check
    assert crawl_log.canonicalize_url("") == ''
    assert crawl_log.canonicalize_url("-") == '-'
    assert crawl_log.canonicalize_url("http://чунджа.kz/b/¶-non-ascii") == "http://xn--80ahg0a3ax.kz/b/%C2%B6-non-ascii"
    # canonicalization failure falls back to returning the raw input
    assert crawl_log.canonicalize_url("Not a URL") == "Not a URL"
def test_long_warcprox_meta(
warcprox_, http_daemon, archiving_proxies, playback_proxies):
urls_before = warcprox_.proxy.running_stats.urls

View File

@ -177,6 +177,8 @@ class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
MAX_BATCH_SIZE = 500
MAX_BATCH_SEC = 60
MIN_BATCH_SEC = 30
# these updated batch seconds values have resulted in fewer reported dedup
# errors and otherwise have worked well in qa
def _get_process_put(self):
batch = []

View File

@ -25,6 +25,7 @@ import json
import os
import warcprox
import socket
import rfc3986
from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
class CrawlLogger(object):
@ -65,22 +66,36 @@ class CrawlLogger(object):
content_length = 0
payload_digest = '-'
logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)
hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path', '-')
if hop_path is None:
hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
#URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly
brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url'))
hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url'))
if hop_path is None and brozzled_url is None and hop_via_url is None:
#No hop info headers provided
hop_path = "-"
hop_path_referer = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path_referer', "-")
if hop_path_referer != recorded_url.url.decode('ascii'):
if hop_path == "-":
hop_path = "B"
else:
hop_path = "".join([hop_path,"B"])
via_url = recorded_url.referer or '-'
else:
if hop_path is None:
hop_path = "-"
if hop_via_url is None:
hop_via_url = "-"
#Prefer referer header. Otherwise use provided via_url
via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
logging.info('brozzled_url:%s recorded_url:%s' , brozzled_url, recorded_url.url)
if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
#Requested page is not the Brozzled url, thus we are an embed or redirect.
via_url = brozzled_url
hop_path = "B" if hop_path == "-" else "".join([hop_path,"B"])
fields = [
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
'% 5s' % status,
'% 10s' % content_length,
recorded_url.url,
hop_path,
recorded_url.referer or hop_path_referer if hop_path != "-" else "-",
via_url,
recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
'-',
'{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
@ -133,3 +148,17 @@ class CrawlLogger(object):
else:
return recorded_url.status
def canonicalize_url(url):
    """Canonicalize a URL for crawl-log comparison purposes.

    The URL is split so the hostname can be encoded separately from the rest
    of the path: the hostname is idna-encoded (punycode), while the remainder
    is left to rfc3986's normalization. Browsers only encode "unsafe" and not
    "reserved" characters, so reserved chars are deliberately not re-encoded.

    Sentinel inputs (None, '', '-') are returned unchanged. On any parse or
    encoding failure the raw url is returned and a warning is logged, so this
    function never raises.
    """
    # pass through the sentinel values used by the crawl-log fields
    if url is None or url == '-' or url == '':
        return url
    try:
        parsed_url = rfc3986.urlparse(url)
        # NOTE(review): str.encode('idna') yields bytes; this relies on
        # rfc3986's copy_with/unsplit accepting a bytes host — TODO confirm
        # against the rfc3986 version pinned in setup.py (>=1.5.0)
        encoded_url = parsed_url.copy_with(host=parsed_url.host.encode('idna'))
        return encoded_url.unsplit()
    except (TypeError, ValueError, AttributeError) as e:
        # best-effort: a host with no idna form (e.g. plain text) lands here
        logging.warning("URL Canonicalization failure. Returning raw url: rfc3986 %s - %s", url, e)
        return url

View File

@ -64,6 +64,7 @@ import ssl
import warcprox
import threading
import datetime
import random
import socks
import tempfile
import hashlib
@ -220,6 +221,28 @@ def via_header_value(orig, request_version):
via = via + '%s %s' % (request_version, 'warcprox')
return via
# Ref and detailed description about cipher selection at
# https://github.com/urllib3/urllib3/blob/f070ec2e6f6c545f40d9196e5246df10c72e48e1/src/urllib3/util/ssl_.py#L170
#
# NOTE: this must stay a mutable list — it is shuffled in place with
# random.shuffle() before each outbound TLS handshake to randomize the
# ClientHello cipher ordering (varying the TLS fingerprint). OpenSSL
# cipher-string semantics make the ordering of the "!..." exclusion
# entries irrelevant; they exclude matching ciphers wherever they appear.
SSL_CIPHERS = [
    "ECDHE+AESGCM",
    "ECDHE+CHACHA20",
    "DH+AESGCM",
    "ECDH+AES",
    "DH+AES",
    "RSA+AESGCM",
    "RSA+AES",
    "!aNULL",
    "!eNULL",
    "!MD5",
    "!DSS",
    "!AESCCM",
    "DHE+AESGCM",
    "DHE+CHACHA20",
    "ECDH+AESGCM",
]
class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'''
An http proxy implementation of BaseHTTPRequestHandler, that acts as a
@ -301,6 +324,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
# randomize TLS fingerprint to evade anti-web-bot systems
random.shuffle(SSL_CIPHERS)
context.set_ciphers(":".join(SSL_CIPHERS))
self._remote_server_conn.sock = context.wrap_socket(
self._remote_server_conn.sock,
server_hostname=self.hostname)