2014-11-20 00:04:43 -08:00
|
|
|
|
#!/usr/bin/env python
|
2017-10-11 12:06:19 -07:00
|
|
|
|
# vim: set fileencoding=utf-8:
|
2016-05-10 01:11:17 -07:00
|
|
|
|
'''
|
|
|
|
|
warcprox/main.py - entrypoint for warcprox executable, parses command line
|
|
|
|
|
arguments, initializes components, starts controller, handles signals
|
|
|
|
|
|
2018-01-24 16:07:45 -08:00
|
|
|
|
Copyright (C) 2013-2018 Internet Archive
|
2016-05-10 01:11:17 -07:00
|
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
|
|
|
modify it under the terms of the GNU General Public License
|
|
|
|
|
as published by the Free Software Foundation; either version 2
|
|
|
|
|
of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|
|
|
|
USA.
|
|
|
|
|
'''
|
2014-11-20 00:04:43 -08:00
|
|
|
|
|
|
|
|
|
from __future__ import absolute_import
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
import queue
|
|
|
|
|
except ImportError:
|
|
|
|
|
import Queue as queue
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
import sys
|
|
|
|
|
import hashlib
|
|
|
|
|
import argparse
|
|
|
|
|
import os
|
|
|
|
|
import socket
|
2015-07-24 20:46:23 +00:00
|
|
|
|
import traceback
|
|
|
|
|
import signal
|
|
|
|
|
import threading
|
2015-03-30 09:32:10 -07:00
|
|
|
|
import certauth.certauth
|
2015-07-30 00:12:59 +00:00
|
|
|
|
import warcprox
|
2017-03-02 15:06:26 -08:00
|
|
|
|
import doublethink
|
2016-09-23 15:54:31 +01:00
|
|
|
|
import cryptography.hazmat.backends.openssl
|
2014-11-20 00:04:43 -08:00
|
|
|
|
|
2017-05-10 18:01:56 +00:00
|
|
|
|
class BetterArgumentDefaultsHelpFormatter(
        argparse.ArgumentDefaultsHelpFormatter,
        argparse.RawDescriptionHelpFormatter):
    '''
    Help formatter combining two stock argparse formatters:

    - option help is rendered the way argparse.ArgumentDefaultsHelpFormatter
      renders it, except that arguments with action='store_const' do not get
      their default value appended
    - the parser description is left untouched, the way
      argparse.RawDescriptionHelpFormatter leaves it
    '''
    def _get_help_string(self, action):
        '''
        Returns the help text to display for `action`.

        Store-const style actions get their help string back verbatim;
        everything else is delegated to ArgumentDefaultsHelpFormatter.
        '''
        if isinstance(action, argparse._StoreConstAction):
            return action.help
        # explicit two-argument super() keeps this working on python 2
        parent = super(BetterArgumentDefaultsHelpFormatter, self)
        return parent._get_help_string(action)
|
2017-05-10 18:01:56 +00:00
|
|
|
|
|
2018-01-15 17:15:36 -08:00
|
|
|
|
def _build_arg_parser(prog='warcprox'):
    '''
    Builds the argparse parser for the warcprox command line.

    Options whose help is argparse.SUPPRESS are accepted but not listed in
    --help output.

    Args:
        prog: program name shown in usage and help messages

    Returns:
        the configured argparse.ArgumentParser
    '''
    arg_parser = argparse.ArgumentParser(prog=prog,
            description='warcprox - WARC writing MITM HTTP/S proxy',
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('-p', '--port', dest='port', default=8000,
            type=int, help='port to listen on')
    arg_parser.add_argument('-b', '--address', dest='address',
            default='localhost', help='address to listen on')
    arg_parser.add_argument('-c', '--cacert', dest='cacert',
            default='./{0}-warcprox-ca.pem'.format(socket.gethostname()),
            help='CA certificate file; if file does not exist, it will be created')
    arg_parser.add_argument('--certs-dir', dest='certs_dir',
            default='./{0}-warcprox-ca'.format(socket.gethostname()),
            help='where to store and load generated certificates')
    arg_parser.add_argument('-d', '--dir', dest='directory',
            default='./warcs', help='where to write warcs')
    arg_parser.add_argument('--warc-filename', dest='warc_filename',
            default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
            help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
    arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
            help='write gzip-compressed warc records')
    arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
            default=False, action='store_true', help=argparse.SUPPRESS)
    # not mentioned in --help: special value for '-' for --prefix means don't
    # archive the capture, unless prefix set in warcprox-meta header
    arg_parser.add_argument(
            '-n', '--prefix', dest='prefix', default='WARCPROX',
            help='default WARC filename prefix')
    arg_parser.add_argument(
            '-s', '--size', dest='rollover_size', default=1000*1000*1000,
            type=int, help='WARC file rollover size threshold in bytes')
    arg_parser.add_argument('--rollover-idle-time',
            dest='rollover_idle_time', default=None, type=int,
            help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
    # hashlib.algorithms_guaranteed is missing on older pythons, which offer
    # hashlib.algorithms instead
    try:
        hash_algos = hashlib.algorithms_guaranteed
    except AttributeError:
        hash_algos = hashlib.algorithms
    # sorted() because algorithms_guaranteed is a set; without it the order
    # of names in the help text varies from run to run
    arg_parser.add_argument('-g', '--digest-algorithm', dest='digest_algorithm',
            default='sha1', help='digest algorithm, one of {}'.format(', '.join(sorted(hash_algos))))
    arg_parser.add_argument('--base32', dest='base32', action='store_true',
            default=False, help='write digests in Base32 instead of hex')
    arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD',
            action='append', help='only record requests with the given http method(s) (can be used more than once)')

    # statistics backend: sqlite file or rethinkdb, not both
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
            '--stats-db-file', dest='stats_db_file',
            default='./warcprox.sqlite', help=(
                'persistent statistics database file; empty string or '
                '/dev/null disables statistics tracking'))
    group.add_argument(
            '--rethinkdb-stats-url', dest='rethinkdb_stats_url', help=(
                'rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_stats_table'))

    arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
            type=int, default=None, help='port to listen on for instant playback')
    # arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
    #         default='./warcprox-playback-index.db',
    #         help='playback index database file (only used if --playback-port is specified)')
    # deduplication backend: at most one of these may be given
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
            default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
    group.add_argument(
            '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=(
                'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
            '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=(
                'rethinkdb big table url (table will be populated with '
                'various capture information and is suitable for use as '
                'index for playback), e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
            '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=(
                '🐷 url pointing to trough configuration rethinkdb database, '
                'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
                '/trough_configuration'))
    group.add_argument('--cdxserver-dedup', dest='cdxserver_dedup',
            help='use a CDX Server URL for deduplication; e.g. https://web.archive.org/cdx/search')
    arg_parser.add_argument(
            '--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
                'rethinkdb service registry table url; if provided, warcprox '
                'will create and heartbeat entry for itself'))
    # optional cookie values to pass to CDX Server; e.g. "cookie1=val1;cookie2=val2"
    arg_parser.add_argument('--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies',
            help=argparse.SUPPRESS)
    arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
            type=int, default=0,
            help=('try to dedup text resources with payload size over this limit in bytes'))
    arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size',
            type=int, default=0, help=(
                'try to dedup binary resources with payload size over this limit in bytes'))
    # optionally, dedup request only when `dedup-bucket` is available in
    # Warcprox-Meta HTTP header. By default, we dedup all requests.
    arg_parser.add_argument('--dedup-only-with-bucket', dest='dedup_only_with_bucket',
            action='store_true', default=False, help=argparse.SUPPRESS)
    arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
            default=500, help=argparse.SUPPRESS)
    arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
            help=argparse.SUPPRESS)
    arg_parser.add_argument('--profile', action='store_true', default=False,
            help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--writer-threads', dest='writer_threads', type=int, default=None,
            help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
            default=None, help=(
                'host:port of tor socks proxy, used only to connect to '
                '.onion sites'))
    # Configurable connection socket timeout, default is 60 sec.
    arg_parser.add_argument(
            '--socket-timeout', dest='socket_timeout', type=float,
            default=None, help=argparse.SUPPRESS)
    # Increasing this value increases memory usage but reduces /tmp disk I/O.
    arg_parser.add_argument(
            '--tmp-file-max-memory-size', dest='tmp_file_max_memory_size',
            type=int, default=512*1024, help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--max-resource-size', dest='max_resource_size', type=int,
            default=None, help='maximum resource size limit in bytes')
    arg_parser.add_argument(
            '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
                'if specified, write crawl log files in the specified '
                'directory; one crawl log is written per warc filename '
                'prefix; crawl log format mimics heritrix'))
    arg_parser.add_argument(
            '--plugin', metavar='PLUGIN_CLASS', dest='plugins',
            action='append', help=(
                'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
                'May be used multiple times to register multiple plugins. '
                'See readme.rst for more information.'))
    arg_parser.add_argument('--version', action='version',
            version="warcprox {}".format(warcprox.__version__))
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
    arg_parser.add_argument('--trace', dest='trace', action='store_true')
    arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')

    return arg_parser
|
|
|
|
|
|
2015-07-24 20:46:23 +00:00
|
|
|
|
def dump_state(signum=None, frame=None):
    '''
    Signal handler, logs stack traces of active threads.

    Emits a single WARNING-level log message holding, for every thread
    returned by threading.enumerate(), the thread's string representation
    followed by its formatted stack. If a thread's stack cannot be obtained,
    a '<n/a:...>' placeholder showing the exception is logged in its place.

    Args:
        signum: signal number, only used in the log message
        frame: unused; accepted so the function fits the signal handler
            signature
    '''
    # list threads first, then take one snapshot of all frames instead of
    # asking the interpreter for a fresh snapshot per thread
    threads = threading.enumerate()
    frames = sys._current_frames()

    state_strs = []
    for th in threads:
        try:
            state_strs.append(str(th))
            stack = traceback.format_stack(frames[th.ident])
            state_strs.append(''.join(stack))
        except Exception as e:
            # e.g. KeyError when the thread has no frame in the snapshot
            state_strs.append('<n/a:%r>' % e)

    # logging.warning rather than the deprecated alias logging.warn
    logging.warning(
            'dumping state (caught signal %s)\n%s',
            signum, '\n'.join(state_strs))
|
2015-07-24 20:46:23 +00:00
|
|
|
|
|
2018-01-12 14:58:26 -08:00
|
|
|
|
def parse_args(argv):
    '''
    Parses command line arguments with argparse.

    Args:
        argv: full argument vector; argv[0] is the program name (its
            basename is used as `prog`), the rest are the arguments to parse

    Returns:
        the argparse.Namespace of parsed arguments

    Raises:
        SystemExit: with status 1 if hashlib cannot construct a hash for the
            requested --digest-algorithm (argparse itself may also exit on
            invalid arguments)
    '''
    arg_parser = _build_arg_parser(prog=os.path.basename(argv[0]))
    args = arg_parser.parse_args(args=argv[1:])

    # fail early if hashlib does not know the requested digest algorithm
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        # sys.exit rather than the builtin exit(), which is installed by the
        # site module and is not guaranteed to exist (e.g. python -S)
        sys.exit(1)

    return args
|
|
|
|
|
|
2017-11-28 10:38:38 -08:00
|
|
|
|
def main(argv=None):
    '''
    Main method, entry point of warcprox command.

    Parses the command line (`argv`, falling back to sys.argv), configures
    logging to stdout, builds a WarcproxController from the parsed options,
    installs signal handlers and runs the controller until shutdown.
    '''
    args = parse_args(argv or sys.argv)

    # later assignments take precedence: --trace beats --verbose beats --quiet
    level = logging.INFO
    if args.quiet:
        level = logging.WARNING
    if args.verbose:
        level = logging.DEBUG
    if args.trace:
        level = warcprox.TRACE

    log_format = (
            '%(asctime)s %(process)d %(levelname)s %(threadName)s '
            '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
    logging.basicConfig(stream=sys.stdout, level=level, format=log_format)

    # see https://github.com/pyca/cryptography/issues/2911
    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()

    controller = warcprox.controller.WarcproxController(
            warcprox.Options(**vars(args)))

    def request_stop(signum, frame):
        # SIGTERM and SIGINT both just set the controller's stop event
        controller.stop.set()

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, request_stop)

    try:
        signal.signal(signal.SIGQUIT, dump_state)
    except AttributeError:
        # SIGQUIT does not exist on some platforms (windows)
        pass

    controller.run_until_shutdown()
|
2014-11-20 00:04:43 -08:00
|
|
|
|
|
2017-11-28 10:38:38 -08:00
|
|
|
|
def ensure_rethinkdb_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple instances
    are starting up at the same time, you can end up with duplicate broken
    tables. So it's a good idea to use this utility at an early step when
    spinning up a cluster.

    Args:
        argv: full argument vector (argv[0] is the program name); defaults
            to sys.argv

    Logs an error, and otherwise does nothing, if none of the --rethinkdb-*
    options is supplied.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '--rethinkdb-stats-url', dest='rethinkdb_stats_url', help=(
                'rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
            '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=(
                'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
            '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=(
                'rethinkdb big table url (table will be populated with '
                'various capture information and is suitable for use as '
                'index for playback), e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
            '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=(
                '🐷 url pointing to trough configuration rethinkdb database, '
                'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
                '/trough_configuration'))
    arg_parser.add_argument(
            '--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
                'rethinkdb service registry table url; if provided, warcprox '
                'will create and heartbeat entry for itself'))
    # -q and -v share one destination; whichever appears last wins
    arg_parser.add_argument(
            '-q', '--quiet', dest='log_level',
            action='store_const', default=logging.INFO, const=logging.WARN)
    arg_parser.add_argument(
            '-v', '--verbose', dest='log_level',
            action='store_const', default=logging.INFO, const=logging.DEBUG)
    args = arg_parser.parse_args(args=argv[1:])

    logging.basicConfig(
            stream=sys.stdout, level=args.log_level, format=(
                '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))

    options = warcprox.Options(**vars(args))

    # NOTE(review): apart from the explicit _ensure_db_table() call for
    # stats, the objects below are only instantiated; presumably their
    # constructors create the tables -- confirm against those classes
    did_something = False
    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
        did_something = True
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
        stats_db._ensure_db_table()
        did_something = True
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
        did_something = True
    if args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        # logging.warning rather than the deprecated alias logging.warn
        logging.warning(
                'trough is responsible for creating most of the rethinkdb '
                'tables that it uses')
        did_something = True

    if not did_something:
        logging.error('nothing to do, no --rethinkdb-* options supplied')
|
2016-06-30 15:24:40 -05:00
|
|
|
|
|
2014-11-20 00:04:43 -08:00
|
|
|
|
# run the warcprox entry point when this file is executed as a script
if __name__ == '__main__':
    main()
|
|
|
|
|
|