mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fix benchmarks (update command line args)
This commit is contained in:
parent
fd7dbaf1cb
commit
e538637b65
@ -163,78 +163,87 @@ Benchmarking code uses asyncio/aiohttp and requires python 3.5 or later.
|
||||
arg_parser = argparse.ArgumentParser(
|
||||
prog=prog, description=desc,
|
||||
formatter_class=warcprox.main.BetterArgumentDefaultsHelpFormatter)
|
||||
arg_parser.add_argument(
|
||||
'-z', '--gzip', dest='gzip', action='store_true',
|
||||
|
||||
### these warcprox options are not configurable for the benchmarks
|
||||
# arg_parser.add_argument('-p', '--port', dest='port', default='8000',
|
||||
# type=int, help='port to listen on')
|
||||
# arg_parser.add_argument('-b', '--address', dest='address',
|
||||
# default='localhost', help='address to listen on')
|
||||
# arg_parser.add_argument('-c', '--cacert', dest='cacert',
|
||||
# default='./{0}-warcprox-ca.pem'.format(socket.gethostname()),
|
||||
# help='CA certificate file; if file does not exist, it will be created')
|
||||
# arg_parser.add_argument('--certs-dir', dest='certs_dir',
|
||||
# default='./{0}-warcprox-ca'.format(socket.gethostname()),
|
||||
# help='where to store and load generated certificates')
|
||||
# arg_parser.add_argument('-d', '--dir', dest='directory',
|
||||
# default='./warcs', help='where to write warcs')
|
||||
|
||||
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
|
||||
help='write gzip-compressed warc records')
|
||||
arg_parser.add_argument('-n', '--prefix', dest='prefix',
|
||||
default='WARCPROX', help='WARC filename prefix')
|
||||
arg_parser.add_argument(
|
||||
'-s', '--size', dest='size', default=1000*1000*1000, type=int,
|
||||
help='WARC file rollover size threshold in bytes')
|
||||
arg_parser.add_argument(
|
||||
'--rollover-idle-time', dest='rollover_idle_time', default=None,
|
||||
type=int, help=(
|
||||
'WARC file rollover idle time threshold in seconds (so that '
|
||||
"Friday's last open WARC doesn't sit there all weekend "
|
||||
'waiting for more data)'))
|
||||
'-s', '--size', dest='rollover_size', default=1000*1000*1000,
|
||||
type=int, help='WARC file rollover size threshold in bytes')
|
||||
arg_parser.add_argument('--rollover-idle-time',
|
||||
dest='rollover_idle_time', default=None, type=int,
|
||||
help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
|
||||
try:
|
||||
hash_algos = hashlib.algorithms_guaranteed
|
||||
except AttributeError:
|
||||
hash_algos = hashlib.algorithms
|
||||
arg_parser.add_argument(
|
||||
'-g', '--digest-algorithm', dest='digest_algorithm',
|
||||
default='sha1', help='digest algorithm, one of %s' % hash_algos)
|
||||
arg_parser.add_argument('-g', '--digest-algorithm', dest='digest_algorithm',
|
||||
default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos)))
|
||||
arg_parser.add_argument('--base32', dest='base32', action='store_true',
|
||||
default=False, help='write digests in Base32 instead of hex')
|
||||
arg_parser.add_argument(
|
||||
'--method-filter', metavar='HTTP_METHOD',
|
||||
action='append', help=(
|
||||
'only record requests with the given http method(s) (can be '
|
||||
'used more than once)'))
|
||||
arg_parser.add_argument(
|
||||
'--stats-db-file', dest='stats_db_file',
|
||||
default=os.path.join(tmpdir, 'stats.db'), help=(
|
||||
'persistent statistics database file; empty string or '
|
||||
'/dev/null disables statistics tracking'))
|
||||
arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD',
|
||||
action='append', help='only record requests with the given http method(s) (can be used more than once)')
|
||||
arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
|
||||
default='./warcprox.sqlite', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
|
||||
arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
|
||||
type=int, default=None, help='port to listen on for instant playback')
|
||||
arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
|
||||
default='./warcprox-playback-index.db',
|
||||
help='playback index database file (only used if --playback-port is specified)')
|
||||
group = arg_parser.add_mutually_exclusive_group()
|
||||
group.add_argument(
|
||||
'-j', '--dedup-db-file', dest='dedup_db_file',
|
||||
default=os.path.join(tmpdir, 'dedup.db'), help=(
|
||||
'persistent deduplication database file; empty string or '
|
||||
'/dev/null disables deduplication'))
|
||||
group.add_argument(
|
||||
'--rethinkdb-servers', dest='rethinkdb_servers', help=(
|
||||
'rethinkdb servers, used for dedup and stats if specified; '
|
||||
'e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org'))
|
||||
# arg_parser.add_argument(
|
||||
# '--rethinkdb-db', dest='rethinkdb_db', default='warcprox', help=(
|
||||
# 'rethinkdb database name (ignored unless --rethinkdb-servers '
|
||||
# 'is specified)'))
|
||||
group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
|
||||
default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
|
||||
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
|
||||
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
|
||||
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
|
||||
help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
|
||||
arg_parser.add_argument('--rethinkdb-big-table',
|
||||
dest='rethinkdb_big_table', action='store_true', default=False,
|
||||
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
|
||||
arg_parser.add_argument(
|
||||
'--rethinkdb-big-table', dest='rethinkdb_big_table',
|
||||
action='store_true', default=False, help=(
|
||||
'use a big rethinkdb table called "captures", instead of a '
|
||||
'small table called "dedup"; table is suitable for use as '
|
||||
'index for playback (ignored unless --rethinkdb-servers is '
|
||||
'specified)'))
|
||||
'--rethinkdb-big-table-name', dest='rethinkdb_big_table_name',
|
||||
default='captures', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
|
||||
default=500, help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
|
||||
help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--profile', action='store_true', default=False,
|
||||
help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--queue-size', dest='queue_size', type=int, default=1, help=(
|
||||
'max size of the queue of urls waiting to be processed by '
|
||||
'the warc writer thread'))
|
||||
'--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
|
||||
default=None, help=(
|
||||
'host:port of tor socks proxy, used only to connect to '
|
||||
'.onion sites'))
|
||||
arg_parser.add_argument(
|
||||
'--max-threads', dest='max_threads', type=int, help=(
|
||||
'number of proxy server threads (if not specified, chosen based '
|
||||
'on system resource limits'))
|
||||
arg_parser.add_argument(
|
||||
'--version', action='version',
|
||||
version='warcprox %s' % warcprox.__version__)
|
||||
arg_parser.add_argument(
|
||||
'-v', '--verbose', dest='verbose', action='store_true',
|
||||
help='verbose logging')
|
||||
arg_parser.add_argument(
|
||||
'--trace', dest='trace', action='store_true',
|
||||
help='trace-level logging')
|
||||
arg_parser.add_argument(
|
||||
'--profile', dest='profile', action='store_true', default=False,
|
||||
help='profile the warc writer thread')
|
||||
'--plugin', metavar='PLUGIN_CLASS', dest='plugins',
|
||||
action='append', help=(
|
||||
'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
|
||||
'May be used multiple times to register multiple plugins. '
|
||||
'Plugin classes are loaded from the regular python module '
|
||||
'search path. They will be instantiated with no arguments and '
|
||||
'must have a method `notify(self, recorded_url, records)` '
|
||||
'which will be called for each url, after warc records have '
|
||||
'been written.'))
|
||||
arg_parser.add_argument('--version', action='version',
|
||||
version="warcprox {}".format(warcprox.__version__))
|
||||
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
||||
arg_parser.add_argument('--trace', dest='trace', action='store_true')
|
||||
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
||||
arg_parser.add_argument(
|
||||
'--requests', dest='requests', type=int, default=200,
|
||||
help='number of urls to fetch')
|
||||
|
Loading…
x
Reference in New Issue
Block a user