mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
make plugin api more flexible
This commit is contained in:
parent
fd3008c727
commit
824c194142
48
README.rst
48
README.rst
@ -34,20 +34,34 @@ get the warning when you visit each new site. But worse, any embedded
|
|||||||
https content on a different server will simply fail to load, because
|
https content on a different server will simply fail to load, because
|
||||||
the browser will reject the certificate without telling you.
|
the browser will reject the certificate without telling you.
|
||||||
|
|
||||||
|
Plugins
|
||||||
|
~~~~~~~
|
||||||
|
|
||||||
|
Warcprox supports a limited notion of plugins by way of the `--plugin` command
|
||||||
|
line argument. Plugin classes are loaded from the regular python module search
|
||||||
|
path. They will be instantiated with one argument, a `warcprox.Options`, which
|
||||||
|
holds the values of all the command line arguments. Legacy plugins with
|
||||||
|
constructors that take no arguments are also supported. Plugins should either
|
||||||
|
have a method `notify(self, recorded_url, records)` or should subclass
|
||||||
|
`warcprox.BasePostfetchProcessor`. More than one plugin can be configured by
|
||||||
|
specifying `--plugin` multiple times.
|
||||||
|
|
||||||
|
XXX example?
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
~~~~~
|
~~~~~
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
|
usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
|
||||||
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
|
[--certs-dir CERTS_DIR] [-d DIRECTORY]
|
||||||
|
[--warc-filename WARC_FILENAME] [-z] [-n PREFIX]
|
||||||
[-s ROLLOVER_SIZE]
|
[-s ROLLOVER_SIZE]
|
||||||
[--rollover-idle-time ROLLOVER_IDLE_TIME]
|
[--rollover-idle-time ROLLOVER_IDLE_TIME]
|
||||||
[-g DIGEST_ALGORITHM] [--base32]
|
[-g DIGEST_ALGORITHM] [--base32]
|
||||||
[--method-filter HTTP_METHOD]
|
[--method-filter HTTP_METHOD]
|
||||||
[--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL]
|
[--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL]
|
||||||
[-P PLAYBACK_PORT]
|
[-P PLAYBACK_PORT]
|
||||||
[--playback-index-db-file PLAYBACK_INDEX_DB_FILE]
|
|
||||||
[-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP]
|
[-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP]
|
||||||
[--rethinkdb-services-url RETHINKDB_SERVICES_URL]
|
[--rethinkdb-services-url RETHINKDB_SERVICES_URL]
|
||||||
[--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
|
[--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
|
||||||
@ -63,13 +77,19 @@ Usage
|
|||||||
address to listen on (default: localhost)
|
address to listen on (default: localhost)
|
||||||
-c CACERT, --cacert CACERT
|
-c CACERT, --cacert CACERT
|
||||||
CA certificate file; if file does not exist, it
|
CA certificate file; if file does not exist, it
|
||||||
will be created (default: ./ayutla.local-warcprox-
|
will be created (default:
|
||||||
ca.pem)
|
./ayutla.monkeybrains.net-warcprox-ca.pem)
|
||||||
--certs-dir CERTS_DIR
|
--certs-dir CERTS_DIR
|
||||||
where to store and load generated certificates
|
where to store and load generated certificates
|
||||||
(default: ./ayutla.local-warcprox-ca)
|
(default: ./ayutla.monkeybrains.net-warcprox-ca)
|
||||||
-d DIRECTORY, --dir DIRECTORY
|
-d DIRECTORY, --dir DIRECTORY
|
||||||
where to write warcs (default: ./warcs)
|
where to write warcs (default: ./warcs)
|
||||||
|
--warc-filename WARC_FILENAME
|
||||||
|
define custom WARC filename with variables
|
||||||
|
{prefix}, {timestamp14}, {timestamp17},
|
||||||
|
{serialno}, {randomtoken}, {hostname},
|
||||||
|
{shorthostname} (default:
|
||||||
|
{prefix}-{timestamp17}-{serialno}-{randomtoken})
|
||||||
-z, --gzip write gzip-compressed warc records
|
-z, --gzip write gzip-compressed warc records
|
||||||
-n PREFIX, --prefix PREFIX
|
-n PREFIX, --prefix PREFIX
|
||||||
default WARC filename prefix (default: WARCPROX)
|
default WARC filename prefix (default: WARCPROX)
|
||||||
@ -81,8 +101,8 @@ Usage
|
|||||||
(so that Friday's last open WARC doesn't sit there
|
(so that Friday's last open WARC doesn't sit there
|
||||||
all weekend waiting for more data) (default: None)
|
all weekend waiting for more data) (default: None)
|
||||||
-g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
|
-g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
|
||||||
digest algorithm, one of sha256, sha224, sha512,
|
digest algorithm, one of sha384, sha224, md5,
|
||||||
sha384, md5, sha1 (default: sha1)
|
sha256, sha512, sha1 (default: sha1)
|
||||||
--base32 write digests in Base32 instead of hex
|
--base32 write digests in Base32 instead of hex
|
||||||
--method-filter HTTP_METHOD
|
--method-filter HTTP_METHOD
|
||||||
only record requests with the given http method(s)
|
only record requests with the given http method(s)
|
||||||
@ -98,10 +118,6 @@ Usage
|
|||||||
-P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
|
-P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
|
||||||
port to listen on for instant playback (default:
|
port to listen on for instant playback (default:
|
||||||
None)
|
None)
|
||||||
--playback-index-db-file PLAYBACK_INDEX_DB_FILE
|
|
||||||
playback index database file (only used if
|
|
||||||
--playback-port is specified) (default:
|
|
||||||
./warcprox-playback-index.db)
|
|
||||||
-j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
|
-j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
|
||||||
persistent deduplication database file; empty
|
persistent deduplication database file; empty
|
||||||
string or /dev/null disables deduplication
|
string or /dev/null disables deduplication
|
||||||
@ -138,12 +154,8 @@ Usage
|
|||||||
--plugin PLUGIN_CLASS
|
--plugin PLUGIN_CLASS
|
||||||
Qualified name of plugin class, e.g.
|
Qualified name of plugin class, e.g.
|
||||||
"mypkg.mymod.MyClass". May be used multiple times
|
"mypkg.mymod.MyClass". May be used multiple times
|
||||||
to register multiple plugins. Plugin classes are
|
to register multiple plugins. See README.rst for
|
||||||
loaded from the regular python module search path.
|
more information. (default: None)
|
||||||
They will be instantiated with no arguments and
|
|
||||||
must have a method `notify(self, recorded_url,
|
|
||||||
records)` which will be called for each url, after
|
|
||||||
warc records have been written. (default: None)
|
|
||||||
--version show program's version number and exit
|
--version show program's version number and exit
|
||||||
-v, --verbose
|
-v, --verbose
|
||||||
--trace
|
--trace
|
||||||
@ -156,7 +168,7 @@ Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
|
|||||||
GPL.
|
GPL.
|
||||||
|
|
||||||
* Copyright (C) 2012 Cygnos Corporation
|
* Copyright (C) 2012 Cygnos Corporation
|
||||||
* Copyright (C) 2013-2017 Internet Archive
|
* Copyright (C) 2013-2018 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
|
4
setup.py
4
setup.py
@ -2,7 +2,7 @@
|
|||||||
'''
|
'''
|
||||||
setup.py - setuptools installation configuration for warcprox
|
setup.py - setuptools installation configuration for warcprox
|
||||||
|
|
||||||
Copyright (C) 2013-2017 Internet Archive
|
Copyright (C) 2013-2016 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -52,7 +52,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4b1.dev143',
|
version='2.4b1.dev144',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
'''
|
'''
|
||||||
tests/test_warcprox.py - automated tests for warcprox
|
tests/test_warcprox.py - automated tests for warcprox
|
||||||
|
|
||||||
Copyright (C) 2013-2017 Internet Archive
|
Copyright (C) 2013-2018 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -1396,7 +1396,10 @@ def test_controller_with_defaults():
|
|||||||
assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
|
assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
|
||||||
|
|
||||||
def test_load_plugin():
|
def test_load_plugin():
|
||||||
options = warcprox.Options(port=0, plugins=['warcprox.stats.RunningStats'])
|
options = warcprox.Options(port=0, plugins=[
|
||||||
|
'warcprox.stats.RunningStats',
|
||||||
|
'warcprox.BaseStandardPostfetchProcessor',
|
||||||
|
'warcprox.BaseBatchPostfetchProcessor',])
|
||||||
controller = warcprox.controller.WarcproxController(options)
|
controller = warcprox.controller.WarcproxController(options)
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
controller._postfetch_chain[-1],
|
controller._postfetch_chain[-1],
|
||||||
@ -1404,11 +1407,18 @@ def test_load_plugin():
|
|||||||
assert isinstance(
|
assert isinstance(
|
||||||
controller._postfetch_chain[-1].listener,
|
controller._postfetch_chain[-1].listener,
|
||||||
warcprox.stats.RunningStats)
|
warcprox.stats.RunningStats)
|
||||||
|
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
controller._postfetch_chain[-2],
|
controller._postfetch_chain[-2],
|
||||||
|
warcprox.BaseBatchPostfetchProcessor)
|
||||||
|
assert isinstance(
|
||||||
|
controller._postfetch_chain[-3],
|
||||||
|
warcprox.BaseStandardPostfetchProcessor)
|
||||||
|
assert isinstance(
|
||||||
|
controller._postfetch_chain[-4],
|
||||||
warcprox.ListenerPostfetchProcessor)
|
warcprox.ListenerPostfetchProcessor)
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
controller._postfetch_chain[-2].listener,
|
controller._postfetch_chain[-4].listener,
|
||||||
warcprox.stats.RunningStats)
|
warcprox.stats.RunningStats)
|
||||||
|
|
||||||
def test_choose_a_port_for_me(warcprox_):
|
def test_choose_a_port_for_me(warcprox_):
|
||||||
|
@ -92,13 +92,17 @@ class Factory:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def plugin(qualname):
|
def plugin(qualname, options):
|
||||||
try:
|
try:
|
||||||
(module_name, class_name) = qualname.rsplit('.', 1)
|
(module_name, class_name) = qualname.rsplit('.', 1)
|
||||||
module_ = importlib.import_module(module_name)
|
module_ = importlib.import_module(module_name)
|
||||||
class_ = getattr(module_, class_name)
|
class_ = getattr(module_, class_name)
|
||||||
plugin = class_()
|
try: # new plugins take `options` argument
|
||||||
plugin.notify # make sure it has this method
|
plugin = class_(options)
|
||||||
|
except: # backward-compatibility
|
||||||
|
plugin = class_()
|
||||||
|
# check that this is either a listener or a batch processor
|
||||||
|
assert hasattr(plugin, 'notify') ^ hasattr(plugin, '_startup')
|
||||||
return plugin
|
return plugin
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.fatal('problem with plugin class %r: %s', qualname, e)
|
logging.fatal('problem with plugin class %r: %s', qualname, e)
|
||||||
@ -197,15 +201,19 @@ class WarcproxController(object):
|
|||||||
warcprox.ListenerPostfetchProcessor(
|
warcprox.ListenerPostfetchProcessor(
|
||||||
crawl_logger, self.options))
|
crawl_logger, self.options))
|
||||||
|
|
||||||
|
for qualname in self.options.plugins or []:
|
||||||
|
plugin = Factory.plugin(qualname, self.options)
|
||||||
|
if hasattr(plugin, 'notify'):
|
||||||
|
self._postfetch_chain.append(
|
||||||
|
warcprox.ListenerPostfetchProcessor(
|
||||||
|
plugin, self.options))
|
||||||
|
else:
|
||||||
|
self._postfetch_chain.append(plugin)
|
||||||
|
|
||||||
self._postfetch_chain.append(
|
self._postfetch_chain.append(
|
||||||
warcprox.ListenerPostfetchProcessor(
|
warcprox.ListenerPostfetchProcessor(
|
||||||
self.proxy.running_stats, self.options))
|
self.proxy.running_stats, self.options))
|
||||||
|
|
||||||
for qualname in self.options.plugins or []:
|
|
||||||
plugin = Factory.plugin(qualname)
|
|
||||||
self._postfetch_chain.append(
|
|
||||||
warcprox.ListenerPostfetchProcessor(plugin, self.options))
|
|
||||||
|
|
||||||
# chain them all up
|
# chain them all up
|
||||||
self._postfetch_chain[0].inq = inq
|
self._postfetch_chain[0].inq = inq
|
||||||
for i in range(1, len(self._postfetch_chain)):
|
for i in range(1, len(self._postfetch_chain)):
|
||||||
|
@ -302,7 +302,7 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
|
|
||||||
def _filter_and_bucketize(self, batch):
|
def _filter_and_bucketize(self, batch):
|
||||||
'''
|
'''
|
||||||
Returns `{bucket: [recorded_url, ...]}`, excluding urls that should
|
Returns `{bucket: [recorded_url, ...]}`, excluding urls that should not
|
||||||
have dedup info stored.
|
have dedup info stored.
|
||||||
'''
|
'''
|
||||||
buckets = collections.defaultdict(list)
|
buckets = collections.defaultdict(list)
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
warcprox/main.py - entrypoint for warcprox executable, parses command line
|
warcprox/main.py - entrypoint for warcprox executable, parses command line
|
||||||
arguments, initializes components, starts controller, handles signals
|
arguments, initializes components, starts controller, handles signals
|
||||||
|
|
||||||
Copyright (C) 2013-2017 Internet Archive
|
Copyright (C) 2013-2018 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -172,11 +172,7 @@ def _build_arg_parser(prog='warcprox'):
|
|||||||
action='append', help=(
|
action='append', help=(
|
||||||
'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
|
'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
|
||||||
'May be used multiple times to register multiple plugins. '
|
'May be used multiple times to register multiple plugins. '
|
||||||
'Plugin classes are loaded from the regular python module '
|
'See README.rst for more information.'))
|
||||||
'search path. They will be instantiated with no arguments and '
|
|
||||||
'must have a method `notify(self, recorded_url, records)` '
|
|
||||||
'which will be called for each url, after warc records have '
|
|
||||||
'been written.'))
|
|
||||||
arg_parser.add_argument('--version', action='version',
|
arg_parser.add_argument('--version', action='version',
|
||||||
version="warcprox {}".format(warcprox.__version__))
|
version="warcprox {}".format(warcprox.__version__))
|
||||||
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user