mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge branch 'trough-dedup' into qa
* trough-dedup: update travis-ci trough deployment on error from trough read or write url, delete read/write url from cache, so next request will retrieve a fresh, hopefully working, url (n.b. not covered by automated tests at this point) cache trough read and write urls update trough dedup to use new segment manager api to register schema sql it finally works! another travis tweak though can we edit /etc/hosts in travis-ci? ugh fix docker command line arg docker container for trough needs a hostname that works from outside the container (since it registers itself in the service registry) trough logs are inside the docker container now need docker to publish the rethinkdb port for --rethinkdb-dedup-url and --rethinkdb-big-table-url tests apparently you can't use docker run options --rm and --detach together in travis-ci, run trough in another docker container, so that its version of python can be independent of the one used to run the warcprox tests remove some debugging from .travis.yml and importantly, deactivate the trough virtualenv before installing warcprox and running tests (otherwise it uses the wrong version of python) missed an ampersand bangin (is the problem that we didn't start trough-read? banging more banging on travis-ci cryptography 2.1.1 seems to be the problem banging on travis-ci first attempt to run trough on travis-ci get all the tests to pass with ./tests/run-tests.sh install and run trough in docker container for testing change rethinkdb-related command line options to use "rethinkdb urls" (parser just added to doublethink) to reduce the proliferation of rethinkdb options, and add --rethinkdb-trough-db-url option trough for deduplication initial proof-of-concept-ish code
This commit is contained in:
commit
7ef2133628
33
.travis.yml
33
.travis.yml
@ -1,4 +1,4 @@
|
||||
group: deprecated-2017Q2
|
||||
sudo: required
|
||||
|
||||
language: python
|
||||
python:
|
||||
@ -27,15 +27,38 @@ services:
|
||||
|
||||
before_install:
|
||||
- sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778
|
||||
- docker run -d --publish=28015:28015 rethinkdb
|
||||
- docker network create --driver=bridge trough
|
||||
- docker run --detach --network=trough --hostname=rethinkdb --name=rethinkdb --publish=28015:28015 rethinkdb
|
||||
- docker run --detach --network=trough --hostname=hadoop --name=hadoop chalimartines/cdh5-pseudo-distributed
|
||||
- docker run --detach --network=trough --hostname=trough --volume="$PWD/tests/run-trough.sh:/run-trough.sh" --publish=6111:6111 --publish=6112:6112 --publish=6222:6222 --publish=6444:6444 python:3 bash /run-trough.sh
|
||||
- cat /etc/hosts
|
||||
- echo | sudo tee -a /etc/hosts # travis-ci default doesn't end with a newline 🙄
|
||||
- echo 127.0.0.1 rethinkdb | sudo tee -a /etc/hosts
|
||||
- echo 127.0.0.1 hadoop | sudo tee -a /etc/hosts
|
||||
- echo 127.0.0.1 trough | sudo tee -a /etc/hosts
|
||||
- cat /etc/hosts
|
||||
- ping -c2 trough
|
||||
|
||||
install:
|
||||
- pip install . pytest requests warcio
|
||||
|
||||
before_script:
|
||||
- pip install . pytest requests warcio
|
||||
- ps ww -fHe
|
||||
|
||||
script:
|
||||
- py.test -v tests
|
||||
- py.test -v --rethinkdb-servers=localhost tests
|
||||
- py.test -v --rethinkdb-servers=localhost --rethinkdb-big-table tests
|
||||
- py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests
|
||||
- py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests
|
||||
- py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests
|
||||
|
||||
after_script:
|
||||
- ps ww -fHe
|
||||
- docker exec trough cat /tmp/trough-write.out
|
||||
- docker exec trough cat /tmp/trough-write-provisioner-server.out
|
||||
- docker exec trough cat /tmp/trough-write-provisioner-local.out
|
||||
- docker exec trough cat /tmp/trough-sync-server.out
|
||||
- docker exec trough cat /tmp/trough-sync-local.out
|
||||
- docker exec trough cat /tmp/trough-read.out
|
||||
|
||||
notifications:
|
||||
slack:
|
||||
|
2
setup.py
2
setup.py
@ -39,8 +39,8 @@ deps = [
|
||||
'certauth==1.1.6',
|
||||
'warctools',
|
||||
'urlcanon>=0.1.dev16',
|
||||
'doublethink>=0.2.0.dev87',
|
||||
'urllib3',
|
||||
'doublethink>=0.2.0.dev81',
|
||||
'PySocks',
|
||||
'cryptography!=2.1.1', # 2.1.1 installation is failing on ubuntu
|
||||
]
|
||||
|
@ -1,7 +1,7 @@
|
||||
#
|
||||
# Dockerfile for warcprox tests
|
||||
#
|
||||
# Copyright (C) 2015-2016 Internet Archive
|
||||
# Copyright (C) 2015-2017 Internet Archive
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
@ -23,19 +23,19 @@ FROM phusion/baseimage
|
||||
MAINTAINER Noah Levitt <nlevitt@archive.org>
|
||||
|
||||
# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
|
||||
# and https://github.com/chali/hadoop-cdh-pseudo-docker/blob/master/Dockerfile
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
|
||||
RUN apt-get update && apt-get --auto-remove -y dist-upgrade
|
||||
|
||||
# Add the RethinkDB repository and public key
|
||||
# "RethinkDB Packaging <packaging@rethinkdb.com>" http://download.rethinkdb.com/apt/pubkey.gpg
|
||||
RUN apt-key adv --keyserver pgp.mit.edu --recv-keys 1614552E5765227AEC39EFCFA7E00EF33A8F2399 \
|
||||
RUN curl -s https://download.rethinkdb.com/apt/pubkey.gpg | apt-key add - \
|
||||
&& echo "deb http://download.rethinkdb.com/apt xenial main" > /etc/apt/sources.list.d/rethinkdb.list \
|
||||
&& apt-get update && apt-get -y install rethinkdb
|
||||
|
||||
RUN mkdir -vp /etc/service/rethinkdb \
|
||||
&& echo "#!/bin/sh\nrethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \
|
||||
&& echo "#!/bin/bash\nexec rethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \
|
||||
&& chmod a+x /etc/service/rethinkdb/run
|
||||
|
||||
RUN apt-get -y install git
|
||||
@ -53,6 +53,55 @@ RUN pip install virtualenv
|
||||
|
||||
RUN apt-get -y install tor
|
||||
RUN mkdir -vp /etc/service/tor \
|
||||
&& echo "#!/bin/sh\ntor\n" > /etc/service/tor/run \
|
||||
&& echo "#!/bin/sh\nexec tor\n" > /etc/service/tor/run \
|
||||
&& chmod a+x /etc/service/tor/run
|
||||
|
||||
# hadoop hdfs for trough
|
||||
RUN curl -s https://archive.cloudera.com/cdh5/ubuntu/xenial/amd64/cdh/archive.key | apt-key add - \
|
||||
&& . /etc/lsb-release \
|
||||
&& echo "deb [arch=amd64] http://archive.cloudera.com/cdh5/ubuntu/$DISTRIB_CODENAME/amd64/cdh $DISTRIB_CODENAME-cdh5 contrib" >> /etc/apt/sources.list.d/cloudera.list
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y openjdk-8-jdk hadoop-conf-pseudo
|
||||
|
||||
RUN su hdfs -c 'hdfs namenode -format'
|
||||
|
||||
RUN mv -v /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/core-site.xml.orig \
|
||||
&& cat /etc/hadoop/conf/core-site.xml.orig | sed 's,localhost:8020,0.0.0.0:8020,' > /etc/hadoop/conf/core-site.xml
|
||||
|
||||
RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \
|
||||
&& cat /etc/hadoop/conf/hdfs-site.xml.orig | sed 's,^</configuration>$, <property>\n <name>dfs.permissions.enabled</name>\n <value>false</value>\n </property>\n</configuration>,' > /etc/hadoop/conf/hdfs-site.xml
|
||||
|
||||
RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \
|
||||
&& chmod a+x /etc/my_init.d/50_start_hdfs.sh
|
||||
|
||||
# trough itself
|
||||
RUN virtualenv -p python3 /opt/trough-ve3 \
|
||||
&& . /opt/trough-ve3/bin/activate \
|
||||
&& pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string \
|
||||
&& pip install git+https://github.com/nlevitt/trough.git@toward-warcprox-dedup
|
||||
|
||||
RUN mkdir -vp /etc/service/trough-sync-local \
|
||||
&& echo "#!/bin/bash\nsource /opt/trough-ve3/bin/activate\nexec sync.py >>/tmp/trough-sync-local.out 2>&1" > /etc/service/trough-sync-local/run \
|
||||
&& chmod a+x /etc/service/trough-sync-local/run
|
||||
|
||||
RUN mkdir -vp /etc/service/trough-sync-server \
|
||||
&& echo '#!/bin/bash\nsource /opt/trough-ve3/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec sync.py --server >>/tmp/trough-sync-server.out 2>&1' > /etc/service/trough-sync-server/run \
|
||||
&& chmod a+x /etc/service/trough-sync-server/run
|
||||
|
||||
RUN mkdir -vp /etc/service/trough-read \
|
||||
&& echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6444 --master --processes=2 --harakiri=3200 --socket-timeout=3200 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/reader.py >>/tmp/trough-read.out 2>&1' > /etc/service/trough-read/run \
|
||||
&& chmod a+x /etc/service/trough-read/run
|
||||
|
||||
RUN mkdir -vp /etc/service/trough-write \
|
||||
&& echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6222 --master --processes=2 --harakiri=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/writer.py >>/tmp/trough-write.out 2>&1' > /etc/service/trough-write/run \
|
||||
&& chmod a+x /etc/service/trough-write/run
|
||||
|
||||
RUN mkdir -vp /etc/service/trough-write-provisioner-local \
|
||||
&& echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6112 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/write_provisioner_local.py >>/tmp/trough-write-provisioner-local.out 2>&1' > /etc/service/trough-write-provisioner-local/run \
|
||||
&& chmod a+x /etc/service/trough-write-provisioner-local/run
|
||||
|
||||
RUN mkdir -vp /etc/service/trough-write-provisioner-server \
|
||||
&& echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/write_provisioner_server.py >>/tmp/trough-write-provisioner-server.out 2>&1' > /etc/service/trough-write-provisioner-server/run \
|
||||
&& chmod a+x /etc/service/trough-write-provisioner-server/run
|
||||
|
||||
|
@ -1,39 +1,41 @@
|
||||
#
|
||||
# tests/conftest.py - command line options for warcprox tests
|
||||
#
|
||||
# Copyright (C) 2015-2016 Internet Archive
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
# USA.
|
||||
#
|
||||
# vim: set fileencoding=utf-8:
|
||||
'''
|
||||
tests/conftest.py - command line options for warcprox tests
|
||||
|
||||
Copyright (C) 2015-2017 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
USA.
|
||||
'''
|
||||
|
||||
import pytest
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers',
|
||||
help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
|
||||
parser.addoption('--rethinkdb-big-table',
|
||||
dest='rethinkdb_big_table', action='store_true', default=False,
|
||||
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def rethinkdb_servers(request):
|
||||
return request.config.getoption("--rethinkdb-servers")
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def rethinkdb_big_table(request):
|
||||
return request.config.getoption("--rethinkdb-big-table")
|
||||
|
||||
parser.addoption(
|
||||
'--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=(
|
||||
'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
|
||||
'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
|
||||
parser.addoption(
|
||||
'--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=(
|
||||
'rethinkdb big table url (table will be populated with '
|
||||
'various capture information and is suitable for use as '
|
||||
'index for playback), e.g. rethinkdb://db0.foo.org,'
|
||||
'db1.foo.org:38015/my_warcprox_db/captures'))
|
||||
parser.addoption(
|
||||
'--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=(
|
||||
'🐷 url pointing to trough configuration rethinkdb database, '
|
||||
'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
|
||||
'/trough_configuration'))
|
||||
|
||||
|
@ -5,8 +5,6 @@
|
||||
# features enabled, against that instance of rethinkdb, and also run without
|
||||
# rethinkdb features enabled. With python 2.7 and 3.4.
|
||||
#
|
||||
# tests/conftest.py - command line options for warcprox tests
|
||||
#
|
||||
# Copyright (C) 2015-2017 Internet Archive
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
@ -33,7 +31,7 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
docker build -t internetarchive/warcprox-tests $script_dir
|
||||
|
||||
for python in python2.7 python3
|
||||
for python in python3 python2.7
|
||||
do
|
||||
docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
|
||||
bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
|
||||
@ -42,7 +40,9 @@ do
|
||||
&& source /tmp/venv/bin/activate \
|
||||
&& pip --log-file /tmp/pip.log install . pytest mock requests warcio \
|
||||
&& py.test -v tests \
|
||||
&& py.test -v --rethinkdb-servers=localhost tests \
|
||||
&& py.test -v --rethinkdb-servers=localhost --rethinkdb-big-table tests"
|
||||
&& py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \
|
||||
&& py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \
|
||||
&& py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \
|
||||
"
|
||||
done
|
||||
|
||||
|
34
tests/run-trough.sh
Normal file
34
tests/run-trough.sh
Normal file
@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# this is used by .travis.yml
|
||||
#
|
||||
|
||||
pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string
|
||||
pip install git+https://github.com/internetarchive/trough.git@toward-warcprox-dedup
|
||||
|
||||
mkdir /etc/trough
|
||||
|
||||
# hello docker user-defined bridge networking
|
||||
echo '
|
||||
HDFS_HOST: hadoop
|
||||
RETHINKDB_HOSTS:
|
||||
- rethinkdb
|
||||
' > /etc/trough/settings.yml
|
||||
|
||||
sync.py >>/tmp/trough-sync-local.out 2>&1 &
|
||||
|
||||
sleep 5
|
||||
python -c "
|
||||
import doublethink
|
||||
from trough.settings import settings
|
||||
rr = doublethink.Rethinker(settings['RETHINKDB_HOSTS'])
|
||||
rr.db('trough_configuration').wait().run()"
|
||||
|
||||
sync.py --server >>/tmp/trough-sync-server.out 2>&1 &
|
||||
uwsgi --http :6222 --master --processes=2 --harakiri=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file /usr/local/bin/writer.py >>/tmp/trough-write.out 2>&1 &
|
||||
uwsgi --http :6112 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1 &
|
||||
uwsgi --http :6111 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1 &
|
||||
uwsgi --http :6444 --master --processes=2 --harakiri=3200 --socket-timeout=3200 --max-requests=50000 --vacuum --die-on-term --wsgi-file /usr/local/bin/reader.py >>/tmp/trough-read.out 2>&1 &
|
||||
|
||||
wait
|
||||
|
@ -84,7 +84,8 @@ def _send(self, data):
|
||||
# http_client.HTTPConnection.send = _send
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE,
|
||||
# stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE,
|
||||
stream=sys.stdout, level=warcprox.TRACE,
|
||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
||||
@ -242,7 +243,7 @@ def https_daemon(request, cert):
|
||||
return https_daemon
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def warcprox_(request, rethinkdb_servers, rethinkdb_big_table):
|
||||
def warcprox_(request):
|
||||
orig_dir = os.getcwd()
|
||||
work_dir = tempfile.mkdtemp()
|
||||
logging.info('changing to working directory %r', work_dir)
|
||||
@ -255,12 +256,15 @@ def warcprox_(request, rethinkdb_servers, rethinkdb_big_table):
|
||||
'--playback-port=0',
|
||||
'--onion-tor-socks-proxy=localhost:9050',
|
||||
'--crawl-log-dir=crawl-logs']
|
||||
if rethinkdb_servers:
|
||||
rethinkdb_db = 'warcprox_test_%s' % ''.join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
||||
argv.append('--rethinkdb-servers=%s' % rethinkdb_servers)
|
||||
argv.append('--rethinkdb-db=%s' % rethinkdb_db)
|
||||
if rethinkdb_big_table:
|
||||
argv.append('--rethinkdb-big-table')
|
||||
if request.config.getoption('--rethinkdb-dedup-url'):
|
||||
argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
|
||||
# test these here only
|
||||
argv.append('--rethinkdb-stats-url=rethinkdb://localhost/test0/stats')
|
||||
argv.append('--rethinkdb-services-url=rethinkdb://localhost/test0/services')
|
||||
elif request.config.getoption('--rethinkdb-big-table-url'):
|
||||
argv.append('--rethinkdb-big-table-url=%s' % request.config.getoption('--rethinkdb-big-table-url'))
|
||||
elif request.config.getoption('--rethinkdb-trough-db-url'):
|
||||
argv.append('--rethinkdb-trough-db-url=%s' % request.config.getoption('--rethinkdb-trough-db-url'))
|
||||
|
||||
args = warcprox.main.parse_args(argv)
|
||||
warcprox_ = warcprox.main.init_controller(args)
|
||||
@ -273,10 +277,22 @@ def warcprox_(request, rethinkdb_servers, rethinkdb_big_table):
|
||||
def fin():
|
||||
warcprox_.stop.set()
|
||||
warcprox_thread.join()
|
||||
if rethinkdb_servers:
|
||||
logging.info('dropping rethinkdb database %r', rethinkdb_db)
|
||||
rr = doublethink.Rethinker(rethinkdb_servers)
|
||||
result = rr.db_drop(rethinkdb_db).run()
|
||||
for rethinkdb_url in (
|
||||
warcprox_.options.rethinkdb_big_table_url,
|
||||
warcprox_.options.rethinkdb_dedup_url,
|
||||
warcprox_.options.rethinkdb_services_url,
|
||||
warcprox_.options.rethinkdb_stats_url):
|
||||
if not rethinkdb_url:
|
||||
continue
|
||||
parsed = doublethink.parse_rethinkdb_url(rethinkdb_url)
|
||||
rr = doublethink.Rethinker(servers=parsed.hosts)
|
||||
try:
|
||||
logging.info('dropping rethinkdb database %r', parsed.database)
|
||||
rr.db_drop(parsed.database).run()
|
||||
except Exception as e:
|
||||
logging.warn(
|
||||
'problem deleting rethinkdb database %r: %s',
|
||||
parsed.database, e)
|
||||
logging.info('deleting working directory %r', work_dir)
|
||||
os.chdir(orig_dir)
|
||||
shutil.rmtree(work_dir)
|
||||
@ -410,6 +426,7 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies)
|
||||
# {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'}
|
||||
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup(
|
||||
b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
|
||||
assert dedup_lookup
|
||||
assert dedup_lookup['url'] == url.encode('ascii')
|
||||
assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
|
||||
assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date'])
|
||||
@ -483,6 +500,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
|
||||
# {u'id': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'}
|
||||
dedup_lookup = warcprox_.warc_writer_threads[0].dedup_db.lookup(
|
||||
b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
|
||||
assert dedup_lookup
|
||||
assert dedup_lookup['url'] == url.encode('ascii')
|
||||
assert re.match(br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$', dedup_lookup['id'])
|
||||
assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date'])
|
||||
|
@ -39,13 +39,12 @@ class RethinkCaptures:
|
||||
"""Inserts in batches every 0.5 seconds"""
|
||||
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
|
||||
|
||||
def __init__(
|
||||
self, rr, table="captures", shards=None, replicas=None,
|
||||
options=warcprox.Options()):
|
||||
self.rr = rr
|
||||
self.table = table
|
||||
self.shards = shards or len(rr.servers)
|
||||
self.replicas = replicas or min(3, len(rr.servers))
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
parsed = doublethink.parse_rethinkdb_url(
|
||||
options.rethinkdb_big_table_url)
|
||||
self.rr = doublethink.Rethinker(
|
||||
servers=parsed.hosts, db=parsed.database)
|
||||
self.table = parsed.table
|
||||
self.options = options
|
||||
self._ensure_db_table()
|
||||
|
||||
@ -107,7 +106,9 @@ class RethinkCaptures:
|
||||
self.logger.info(
|
||||
"creating rethinkdb table %r in database %r",
|
||||
self.table, self.rr.dbname)
|
||||
self.rr.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
|
||||
self.rr.table_create(
|
||||
self.table, shards=len(self.rr.servers),
|
||||
replicas=min(3, len(self.rr.servers))).run()
|
||||
self.rr.table(self.table).index_create(
|
||||
"abbr_canon_surt_timestamp",
|
||||
[r.row["abbr_canon_surt"], r.row["timestamp"]]).run()
|
||||
@ -216,8 +217,8 @@ class RethinkCaptures:
|
||||
class RethinkCapturesDedup:
|
||||
logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup")
|
||||
|
||||
def __init__(self, captures_db, options=warcprox.Options()):
|
||||
self.captures_db = captures_db
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
self.captures_db = RethinkCaptures(options=options)
|
||||
self.options = options
|
||||
|
||||
def lookup(self, digest_key, bucket="__unspecified__", url=None):
|
||||
@ -247,3 +248,7 @@ class RethinkCapturesDedup:
|
||||
|
||||
def close(self):
|
||||
self.captures_db.close()
|
||||
|
||||
def notify(self, recorded_url, records):
|
||||
self.captures_db.notify(recorded_url, records)
|
||||
|
||||
|
@ -21,13 +21,16 @@ USA.
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
from hanzo import warctools
|
||||
import warcprox
|
||||
import sqlite3
|
||||
import requests
|
||||
import doublethink
|
||||
import rethinkdb as r
|
||||
import datetime
|
||||
import urllib3
|
||||
from urllib3.exceptions import HTTPError
|
||||
|
||||
@ -121,11 +124,11 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
||||
class RethinkDedupDb:
|
||||
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
||||
|
||||
def __init__(self, rr, table="dedup", shards=None, replicas=None, options=warcprox.Options()):
|
||||
self.rr = rr
|
||||
self.table = table
|
||||
self.shards = shards or len(rr.servers)
|
||||
self.replicas = replicas or min(3, len(rr.servers))
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
|
||||
self.rr = doublethink.Rethinker(
|
||||
servers=parsed.hosts, db=parsed.database)
|
||||
self.table = parsed.table
|
||||
self._ensure_db_table()
|
||||
self.options = options
|
||||
|
||||
@ -138,12 +141,11 @@ class RethinkDedupDb:
|
||||
if not self.table in tables:
|
||||
self.logger.info(
|
||||
"creating rethinkdb table %r in database %r shards=%r "
|
||||
"replicas=%r", self.table, self.rr.dbname, self.shards,
|
||||
self.replicas)
|
||||
"replicas=%r", self.table, self.rr.dbname,
|
||||
len(self.rr.servers), min(3, len(self.rr.servers)))
|
||||
self.rr.table_create(
|
||||
self.table, primary_key="key", shards=self.shards,
|
||||
replicas=self.replicas).run()
|
||||
|
||||
self.table, primary_key="key", shards=len(self.rr.servers),
|
||||
replicas=min(3, len(self.rr.servers))).run()
|
||||
|
||||
def start(self):
|
||||
pass
|
||||
@ -181,7 +183,6 @@ class RethinkDedupDb:
|
||||
else:
|
||||
self.save(digest_key, records[0])
|
||||
|
||||
|
||||
class CdxServerDedup(object):
|
||||
"""Query a CDX server to perform deduplication.
|
||||
"""
|
||||
@ -231,8 +232,8 @@ class CdxServerDedup(object):
|
||||
if line:
|
||||
(cdx_ts, cdx_digest) = line.split(b' ')
|
||||
if cdx_digest == dkey:
|
||||
dt = datetime.strptime(cdx_ts.decode('ascii'),
|
||||
'%Y%m%d%H%M%S')
|
||||
dt = datetime.datetime.strptime(
|
||||
cdx_ts.decode('ascii'), '%Y%m%d%H%M%S')
|
||||
date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
|
||||
return dict(url=url, date=date)
|
||||
except (HTTPError, AssertionError, ValueError) as exc:
|
||||
@ -244,3 +245,197 @@ class CdxServerDedup(object):
|
||||
"""Since we don't save anything to CDX server, this does not apply.
|
||||
"""
|
||||
pass
|
||||
|
||||
class TroughClient(object):
|
||||
logger = logging.getLogger("warcprox.dedup.TroughClient")
|
||||
|
||||
def __init__(self, rethinkdb_trough_db_url):
|
||||
parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
|
||||
self.rr = doublethink.Rethinker(
|
||||
servers=parsed.hosts, db=parsed.database)
|
||||
self.svcreg = doublethink.ServiceRegistry(self.rr)
|
||||
|
||||
def segment_manager_url(self):
|
||||
master_node = self.svcreg.unique_service('trough-sync-master')
|
||||
assert master_node
|
||||
return master_node['url']
|
||||
|
||||
def write_url(self, segment_id, schema_id='default'):
|
||||
provision_url = os.path.join(self.segment_manager_url(), 'provision')
|
||||
payload_dict = {'segment': segment_id, 'schema': schema_id}
|
||||
response = requests.post(provision_url, json=payload_dict)
|
||||
if response.status_code != 200:
|
||||
raise Exception(
|
||||
'Received %s: %r in response to POST %s with data %s' % (
|
||||
response.status_code, response.text, provision_url,
|
||||
json.dumps(payload_dict)))
|
||||
result_dict = response.json()
|
||||
# assert result_dict['schema'] == schema_id # previously provisioned?
|
||||
return result_dict['write_url']
|
||||
|
||||
def read_url(self, segment_id):
|
||||
reql = self.rr.table('services').get_all(
|
||||
segment_id, index='segment').filter(
|
||||
{'role':'trough-read'}).filter(
|
||||
lambda svc: r.now().sub(
|
||||
svc['last_heartbeat']).lt(svc['ttl'])
|
||||
).order_by('load')
|
||||
logging.debug('querying rethinkdb: %r', reql)
|
||||
results = reql.run()
|
||||
if results:
|
||||
return results[0]['url']
|
||||
else:
|
||||
return None
|
||||
|
||||
def schema_exists(self, schema_id):
|
||||
url = os.path.join(self.segment_manager_url(), 'schema', schema_id)
|
||||
response = requests.get(url)
|
||||
if response.status_code == 200:
|
||||
return True
|
||||
elif response.status_code == 404:
|
||||
return False
|
||||
else:
|
||||
response.raise_for_status()
|
||||
|
||||
def register_schema(self, schema_id, sql):
|
||||
url = '%s/schema/%s/sql' % (self.segment_manager_url(), schema_id)
|
||||
response = requests.put(url, sql)
|
||||
if response.status_code not in (201, 204):
|
||||
raise Exception(
|
||||
'Received %s: %r in response to PUT %r with data %r' % (
|
||||
response.status_code, response.text, sql, url))
|
||||
|
||||
class TroughDedupDb(object):
|
||||
'''
|
||||
https://github.com/internetarchive/trough
|
||||
'''
|
||||
logger = logging.getLogger("warcprox.dedup.TroughDedupDb")
|
||||
|
||||
SCHEMA_ID = 'warcprox-dedup-v1'
|
||||
SCHEMA_SQL = ('create table dedup (\n'
|
||||
' digest_key varchar(100) primary key,\n'
|
||||
' url varchar(2100) not null,\n'
|
||||
' date datetime not null,\n'
|
||||
' id varchar(100));\n') # warc record id
|
||||
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
self.options = options
|
||||
self._trough_cli = TroughClient(options.rethinkdb_trough_db_url)
|
||||
self._write_url_cache = {}
|
||||
self._read_url_cache = {}
|
||||
|
||||
def start(self):
|
||||
self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL)
|
||||
|
||||
def _write_url(self, bucket):
|
||||
if not bucket in self._write_url_cache:
|
||||
segment_id = 'warcprox-trough-%s' % bucket
|
||||
self._write_url_cache[bucket] = self._trough_cli.write_url(
|
||||
segment_id, self.SCHEMA_ID)
|
||||
logging.info(
|
||||
'bucket %r write url is %r', bucket,
|
||||
self._write_url_cache[bucket])
|
||||
return self._write_url_cache[bucket]
|
||||
|
||||
def _read_url(self, bucket):
|
||||
if not self._read_url_cache.get(bucket):
|
||||
segment_id = 'warcprox-trough-%s' % bucket
|
||||
self._read_url_cache[bucket] = self._trough_cli.read_url(segment_id)
|
||||
logging.info(
|
||||
'bucket %r read url is %r', bucket,
|
||||
self._read_url_cache[bucket])
|
||||
return self._read_url_cache[bucket]
|
||||
|
||||
def sql_value(self, x):
|
||||
if x is None:
|
||||
return 'null'
|
||||
elif isinstance(x, datetime.datetime):
|
||||
return 'datetime(%r)' % x.isoformat()
|
||||
elif isinstance(x, bool):
|
||||
return int(x)
|
||||
elif isinstance(x, str) or isinstance(x, bytes):
|
||||
# py3: repr(u'abc') => 'abc'
|
||||
# repr(b'abc') => b'abc'
|
||||
# py2: repr(u'abc') => u'abc'
|
||||
# repr(b'abc') => 'abc'
|
||||
# Repr gives us a prefix we don't want in different situations
|
||||
# depending on whether this is py2 or py3. Chop it off either way.
|
||||
r = repr(x)
|
||||
if r[:1] == "'":
|
||||
return r
|
||||
else:
|
||||
return r[1:]
|
||||
else:
|
||||
raise Exception("don't know how to make an sql value from %r" % x)
|
||||
|
||||
def save(self, digest_key, response_record, bucket='__unspecified__'):
|
||||
write_url = self._write_url(bucket)
|
||||
record_id = response_record.get_header(warctools.WarcRecord.ID)
|
||||
url = response_record.get_header(warctools.WarcRecord.URL)
|
||||
warc_date = response_record.get_header(warctools.WarcRecord.DATE)
|
||||
|
||||
sql = ('insert into dedup (digest_key, url, date, id) '
|
||||
'values (%s, %s, %s, %s);') % (
|
||||
self.sql_value(digest_key), self.sql_value(url),
|
||||
self.sql_value(warc_date), self.sql_value(record_id))
|
||||
try:
|
||||
response = requests.post(write_url, sql)
|
||||
except:
|
||||
logging.error(
|
||||
'problem with trough write url %r', write_url,
|
||||
exc_info=True)
|
||||
del self._write_url_cache[bucket]
|
||||
return
|
||||
if response.status_code != 200:
|
||||
del self._write_url_cache[bucket]
|
||||
logging.warn(
|
||||
'unexpected response %r %r %r to sql=%r',
|
||||
response.status_code, response.reason, response.text, sql)
|
||||
else:
|
||||
logging.trace('posted %r to %s', sql, write_url)
|
||||
|
||||
def lookup(self, digest_key, bucket='__unspecified__', url=None):
|
||||
read_url = self._read_url(bucket)
|
||||
if not read_url:
|
||||
return None
|
||||
sql = 'select * from dedup where digest_key=%s;' % (
|
||||
self.sql_value(digest_key))
|
||||
try:
|
||||
response = requests.post(read_url, sql)
|
||||
except:
|
||||
logging.error(
|
||||
'problem with trough read url %r', read_url, exc_info=True)
|
||||
del self._read_url_cache[bucket]
|
||||
return None
|
||||
if response.status_code != 200:
|
||||
del self._read_url_cache[bucket]
|
||||
logging.warn(
|
||||
'unexpected response %r %r %r to sql=%r',
|
||||
response.status_code, response.reason, response.text, sql)
|
||||
return None
|
||||
logging.debug('got %r from query %r', response.text, sql)
|
||||
results = json.loads(response.text)
|
||||
assert len(results) <= 1 # sanity check (digest_key is primary key)
|
||||
if results:
|
||||
result = results[0]
|
||||
result['id'] = result['id'].encode('ascii')
|
||||
result['url'] = result['url'].encode('ascii')
|
||||
result['date'] = result['date'].encode('ascii')
|
||||
self.logger.debug(
|
||||
'trough lookup of key=%r returning %r', digest_key, result)
|
||||
return result
|
||||
else:
|
||||
return None
|
||||
|
||||
def notify(self, recorded_url, records):
|
||||
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
||||
and recorded_url.response_recorder.payload_size() > 0):
|
||||
digest_key = warcprox.digest_str(
|
||||
recorded_url.response_recorder.payload_digest,
|
||||
self.options.base32)
|
||||
if recorded_url.warcprox_meta and 'captures-bucket' in recorded_url.warcprox_meta:
|
||||
self.save(
|
||||
digest_key, records[0],
|
||||
bucket=recorded_url.warcprox_meta['captures-bucket'])
|
||||
else:
|
||||
self.save(digest_key, records[0])
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# vim: set fileencoding=utf-8:
|
||||
'''
|
||||
warcprox/main.py - entrypoint for warcprox executable, parses command line
|
||||
arguments, initializes components, starts controller, handles signals
|
||||
@ -42,6 +43,7 @@ import warcprox
|
||||
import doublethink
|
||||
import cryptography.hazmat.backends.openssl
|
||||
import importlib
|
||||
import doublethink
|
||||
|
||||
class BetterArgumentDefaultsHelpFormatter(
|
||||
argparse.ArgumentDefaultsHelpFormatter,
|
||||
@ -58,7 +60,7 @@ class BetterArgumentDefaultsHelpFormatter(
|
||||
if isinstance(action, argparse._StoreConstAction):
|
||||
return action.help
|
||||
else:
|
||||
return super()._get_help_string(action)
|
||||
return argparse.ArgumentDefaultsHelpFormatter._get_help_string(self, action)
|
||||
|
||||
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
arg_parser = argparse.ArgumentParser(prog=prog,
|
||||
@ -98,8 +100,18 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
default=False, help='write digests in Base32 instead of hex')
|
||||
arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD',
|
||||
action='append', help='only record requests with the given http method(s) (can be used more than once)')
|
||||
arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
|
||||
default='./warcprox.sqlite', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
|
||||
|
||||
group = arg_parser.add_mutually_exclusive_group()
|
||||
group.add_argument(
|
||||
'--stats-db-file', dest='stats_db_file',
|
||||
default='./warcprox.sqlite', help=(
|
||||
'persistent statistics database file; empty string or '
|
||||
'/dev/null disables statistics tracking'))
|
||||
group.add_argument(
|
||||
'--rethinkdb-stats-url', dest='rethinkdb_stats_url', help=(
|
||||
'rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
|
||||
'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
|
||||
|
||||
arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
|
||||
type=int, default=None, help='port to listen on for instant playback')
|
||||
arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
|
||||
@ -108,18 +120,27 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
group = arg_parser.add_mutually_exclusive_group()
|
||||
group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
|
||||
default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
|
||||
group.add_argument(
|
||||
'--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=(
|
||||
'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
|
||||
'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
|
||||
group.add_argument(
|
||||
'--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=(
|
||||
'rethinkdb big table url (table will be populated with '
|
||||
'various capture information and is suitable for use as '
|
||||
'index for playback), e.g. rethinkdb://db0.foo.org,'
|
||||
'db1.foo.org:38015/my_warcprox_db/captures'))
|
||||
group.add_argument(
|
||||
'--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=(
|
||||
'🐷 url pointing to trough configuration rethinkdb database, '
|
||||
'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
|
||||
'/trough_configuration'))
|
||||
group.add_argument('--cdxserver-dedup', dest='cdxserver_dedup',
|
||||
help='use a CDX Server URL for deduplication; e.g. https://web.archive.org/cdx/search')
|
||||
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
|
||||
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
|
||||
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
|
||||
help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
|
||||
arg_parser.add_argument('--rethinkdb-big-table',
|
||||
dest='rethinkdb_big_table', action='store_true', default=False,
|
||||
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
|
||||
arg_parser.add_argument(
|
||||
'--rethinkdb-big-table-name', dest='rethinkdb_big_table_name',
|
||||
default='captures', help=argparse.SUPPRESS)
|
||||
'--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
|
||||
'rethinkdb service registry table url; if provided, warcprox '
|
||||
'will create and heartbeat entry for itself'))
|
||||
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
|
||||
default=500, help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
|
||||
@ -186,30 +207,25 @@ def init_controller(args):
|
||||
exit(1)
|
||||
|
||||
listeners = []
|
||||
if args.rethinkdb_servers:
|
||||
rr = doublethink.Rethinker(
|
||||
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
if args.rethinkdb_big_table:
|
||||
captures_db = warcprox.bigtable.RethinkCaptures(
|
||||
rr, table=args.rethinkdb_big_table_name, options=options)
|
||||
dedup_db = warcprox.bigtable.RethinkCapturesDedup(
|
||||
captures_db, options=options)
|
||||
listeners.append(captures_db)
|
||||
else:
|
||||
dedup_db = warcprox.dedup.RethinkDedupDb(rr, options=options)
|
||||
listeners.append(dedup_db)
|
||||
|
||||
if args.rethinkdb_dedup_url:
|
||||
dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
|
||||
elif args.rethinkdb_big_table_url:
|
||||
dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
|
||||
elif args.rethinkdb_trough_db_url:
|
||||
dedup_db = warcprox.dedup.TroughDedupDb(options)
|
||||
elif args.cdxserver_dedup:
|
||||
dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
|
||||
listeners.append(dedup_db)
|
||||
elif args.dedup_db_file in (None, '', '/dev/null'):
|
||||
logging.info('deduplication disabled')
|
||||
dedup_db = None
|
||||
else:
|
||||
dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
|
||||
if dedup_db:
|
||||
listeners.append(dedup_db)
|
||||
|
||||
if args.rethinkdb_servers:
|
||||
stats_db = warcprox.stats.RethinkStatsDb(rr, options=options)
|
||||
if args.rethinkdb_stats_url:
|
||||
stats_db = warcprox.stats.RethinkStatsDb(options=options)
|
||||
listeners.append(stats_db)
|
||||
elif args.stats_db_file in (None, '', '/dev/null'):
|
||||
logging.info('statistics tracking disabled')
|
||||
@ -263,8 +279,11 @@ def init_controller(args):
|
||||
listeners=listeners, options=options)
|
||||
for i in range(int(proxy.max_threads ** 0.5))]
|
||||
|
||||
if args.rethinkdb_servers:
|
||||
svcreg = doublethink.ServiceRegistry(rr)
|
||||
if args.rethinkdb_services_url:
|
||||
parsed = doublethink.parse_rethinkdb_url(
|
||||
options.rethinkdb_services_url)
|
||||
rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
|
||||
svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
|
||||
else:
|
||||
svcreg = None
|
||||
|
||||
@ -298,8 +317,7 @@ def main(argv=sys.argv):
|
||||
loglevel = logging.INFO
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stdout, level=loglevel,
|
||||
format=(
|
||||
stream=sys.stdout, level=loglevel, format=(
|
||||
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
|
||||
|
||||
@ -326,6 +344,7 @@ def ensure_rethinkdb_tables():
|
||||
tables. So it's a good idea to use this utility at an early step when
|
||||
spinning up a cluster.
|
||||
'''
|
||||
raise Exception('adjust my args')
|
||||
arg_parser = argparse.ArgumentParser(
|
||||
prog=os.path.basename(sys.argv[0]),
|
||||
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||
|
@ -32,6 +32,7 @@ import datetime
|
||||
import urlcanon
|
||||
import sqlite3
|
||||
import copy
|
||||
import doublethink
|
||||
|
||||
def _empty_bucket(bucket):
|
||||
return {
|
||||
@ -189,11 +190,12 @@ class RethinkStatsDb(StatsDb):
|
||||
"""Updates database in batch every 2.0 seconds"""
|
||||
logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
|
||||
|
||||
def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()):
|
||||
self.rr = rethinker
|
||||
self.table = table
|
||||
self.shards = shards or 1 # 1 shard by default because it's probably a small table
|
||||
self.replicas = replicas or min(3, len(self.rr.servers))
|
||||
def __init__(self, options=warcprox.Options()):
|
||||
parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
|
||||
self.rr = doublethink.Rethinker(
|
||||
servers=parsed.hosts, db=parsed.database)
|
||||
self.table = parsed.table
|
||||
self.replicas = min(3, len(self.rr.servers))
|
||||
self._ensure_db_table()
|
||||
self.options = options
|
||||
|
||||
@ -271,10 +273,10 @@ class RethinkStatsDb(StatsDb):
|
||||
if not self.table in tables:
|
||||
self.logger.info(
|
||||
"creating rethinkdb table %r in database %r shards=%r "
|
||||
"replicas=%r", self.table, self.rr.dbname, self.shards,
|
||||
"replicas=%r", self.table, self.rr.dbname, 1,
|
||||
self.replicas)
|
||||
self.rr.table_create(
|
||||
self.table, primary_key="bucket", shards=self.shards,
|
||||
self.table, primary_key="bucket", shards=1,
|
||||
replicas=self.replicas).run()
|
||||
|
||||
def close(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user