fix benchmarks (update command line args)

This commit is contained in:
Noah Levitt 2017-10-23 12:49:32 -07:00
parent fd7dbaf1cb
commit e538637b65

View File

@@ -163,78 +163,87 @@ Benchmarking code uses asyncio/aiohttp and requires python 3.5 or later.
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
prog=prog, description=desc, prog=prog, description=desc,
formatter_class=warcprox.main.BetterArgumentDefaultsHelpFormatter) formatter_class=warcprox.main.BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'-z', '--gzip', dest='gzip', action='store_true', ### these warcprox options are not configurable for the benchmarks
# arg_parser.add_argument('-p', '--port', dest='port', default='8000',
# type=int, help='port to listen on')
# arg_parser.add_argument('-b', '--address', dest='address',
# default='localhost', help='address to listen on')
# arg_parser.add_argument('-c', '--cacert', dest='cacert',
# default='./{0}-warcprox-ca.pem'.format(socket.gethostname()),
# help='CA certificate file; if file does not exist, it will be created')
# arg_parser.add_argument('--certs-dir', dest='certs_dir',
# default='./{0}-warcprox-ca'.format(socket.gethostname()),
# help='where to store and load generated certificates')
# arg_parser.add_argument('-d', '--dir', dest='directory',
# default='./warcs', help='where to write warcs')
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
help='write gzip-compressed warc records') help='write gzip-compressed warc records')
arg_parser.add_argument('-n', '--prefix', dest='prefix',
default='WARCPROX', help='WARC filename prefix')
arg_parser.add_argument( arg_parser.add_argument(
'-s', '--size', dest='size', default=1000*1000*1000, type=int, '-s', '--size', dest='rollover_size', default=1000*1000*1000,
help='WARC file rollover size threshold in bytes') type=int, help='WARC file rollover size threshold in bytes')
arg_parser.add_argument( arg_parser.add_argument('--rollover-idle-time',
'--rollover-idle-time', dest='rollover_idle_time', default=None, dest='rollover_idle_time', default=None, type=int,
type=int, help=( help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
'WARC file rollover idle time threshold in seconds (so that '
"Friday's last open WARC doesn't sit there all weekend "
'waiting for more data)'))
try: try:
hash_algos = hashlib.algorithms_guaranteed hash_algos = hashlib.algorithms_guaranteed
except AttributeError: except AttributeError:
hash_algos = hashlib.algorithms hash_algos = hashlib.algorithms
arg_parser.add_argument( arg_parser.add_argument('-g', '--digest-algorithm', dest='digest_algorithm',
'-g', '--digest-algorithm', dest='digest_algorithm', default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos)))
default='sha1', help='digest algorithm, one of %s' % hash_algos)
arg_parser.add_argument('--base32', dest='base32', action='store_true', arg_parser.add_argument('--base32', dest='base32', action='store_true',
default=False, help='write digests in Base32 instead of hex') default=False, help='write digests in Base32 instead of hex')
arg_parser.add_argument( arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD',
'--method-filter', metavar='HTTP_METHOD', action='append', help='only record requests with the given http method(s) (can be used more than once)')
action='append', help=( arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
'only record requests with the given http method(s) (can be ' default='./warcprox.sqlite', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
'used more than once)')) arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
arg_parser.add_argument( type=int, default=None, help='port to listen on for instant playback')
'--stats-db-file', dest='stats_db_file', arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
default=os.path.join(tmpdir, 'stats.db'), help=( default='./warcprox-playback-index.db',
'persistent statistics database file; empty string or ' help='playback index database file (only used if --playback-port is specified)')
'/dev/null disables statistics tracking'))
group = arg_parser.add_mutually_exclusive_group() group = arg_parser.add_mutually_exclusive_group()
group.add_argument( group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
'-j', '--dedup-db-file', dest='dedup_db_file', default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
default=os.path.join(tmpdir, 'dedup.db'), help=( group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
'persistent deduplication database file; empty string or ' help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
'/dev/null disables deduplication')) arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
group.add_argument( help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
'--rethinkdb-servers', dest='rethinkdb_servers', help=( arg_parser.add_argument('--rethinkdb-big-table',
'rethinkdb servers, used for dedup and stats if specified; ' dest='rethinkdb_big_table', action='store_true', default=False,
'e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')) help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
# arg_parser.add_argument(
# '--rethinkdb-db', dest='rethinkdb_db', default='warcprox', help=(
# 'rethinkdb database name (ignored unless --rethinkdb-servers '
# 'is specified)'))
arg_parser.add_argument( arg_parser.add_argument(
'--rethinkdb-big-table', dest='rethinkdb_big_table', '--rethinkdb-big-table-name', dest='rethinkdb_big_table_name',
action='store_true', default=False, help=( default='captures', help=argparse.SUPPRESS)
'use a big rethinkdb table called "captures", instead of a ' arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
'small table called "dedup"; table is suitable for use as ' default=500, help=argparse.SUPPRESS)
'index for playback (ignored unless --rethinkdb-servers is ' arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
'specified)')) help=argparse.SUPPRESS)
arg_parser.add_argument('--profile', action='store_true', default=False,
help=argparse.SUPPRESS)
arg_parser.add_argument( arg_parser.add_argument(
'--queue-size', dest='queue_size', type=int, default=1, help=( '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
'max size of the queue of urls waiting to be processed by ' default=None, help=(
'the warc writer thread')) 'host:port of tor socks proxy, used only to connect to '
'.onion sites'))
arg_parser.add_argument( arg_parser.add_argument(
'--max-threads', dest='max_threads', type=int, help=( '--plugin', metavar='PLUGIN_CLASS', dest='plugins',
'number of proxy server threads (if not specified, chosen based ' action='append', help=(
'on system resource limits')) 'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
arg_parser.add_argument( 'May be used multiple times to register multiple plugins. '
'--version', action='version', 'Plugin classes are loaded from the regular python module '
version='warcprox %s' % warcprox.__version__) 'search path. They will be instantiated with no arguments and '
arg_parser.add_argument( 'must have a method `notify(self, recorded_url, records)` '
'-v', '--verbose', dest='verbose', action='store_true', 'which will be called for each url, after warc records have '
help='verbose logging') 'been written.'))
arg_parser.add_argument( arg_parser.add_argument('--version', action='version',
'--trace', dest='trace', action='store_true', version="warcprox {}".format(warcprox.__version__))
help='trace-level logging') arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
arg_parser.add_argument( arg_parser.add_argument('--trace', dest='trace', action='store_true')
'--profile', dest='profile', action='store_true', default=False, arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
help='profile the warc writer thread')
arg_parser.add_argument( arg_parser.add_argument(
'--requests', dest='requests', type=int, default=200, '--requests', dest='requests', type=int, default=200,
help='number of urls to fetch') help='number of urls to fetch')