mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
dropping claim of support for python 2.7 (not worth hacking around tempfile.TemporaryDirectory to make tests pass)
This commit is contained in:
parent
500ffad7e4
commit
eacf070a2a
@ -15,6 +15,8 @@ matrix:
|
|||||||
allow_failures:
|
allow_failures:
|
||||||
- python: nightly
|
- python: nightly
|
||||||
- python: 3.7-dev
|
- python: 3.7-dev
|
||||||
|
- python: 2.7
|
||||||
|
- python: pypy
|
||||||
|
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
|
101
README.rst
101
README.rst
@ -9,7 +9,7 @@ https://github.com/allfro/pymiproxy
|
|||||||
Install
|
Install
|
||||||
~~~~~~~
|
~~~~~~~
|
||||||
|
|
||||||
Warcprox runs on python 2.7 or 3.4+.
|
Warcprox runs on python 3.4+.
|
||||||
|
|
||||||
To install latest release run:
|
To install latest release run:
|
||||||
|
|
||||||
@ -41,16 +41,18 @@ Usage
|
|||||||
|
|
||||||
usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
|
usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
|
||||||
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
|
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
|
||||||
[-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME]
|
[-s ROLLOVER_SIZE]
|
||||||
|
[--rollover-idle-time ROLLOVER_IDLE_TIME]
|
||||||
[-g DIGEST_ALGORITHM] [--base32]
|
[-g DIGEST_ALGORITHM] [--base32]
|
||||||
[--method-filter HTTP_METHOD]
|
[--method-filter HTTP_METHOD]
|
||||||
[--stats-db-file STATS_DB_FILE] [-P PLAYBACK_PORT]
|
[--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL]
|
||||||
|
[-P PLAYBACK_PORT]
|
||||||
[--playback-index-db-file PLAYBACK_INDEX_DB_FILE]
|
[--playback-index-db-file PLAYBACK_INDEX_DB_FILE]
|
||||||
[-j DEDUP_DB_FILE | --rethinkdb-servers RETHINKDB_SERVERS]
|
[-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP]
|
||||||
[--cdxserver-dedup CDX_SERVER_URL]
|
[--rethinkdb-services-url RETHINKDB_SERVICES_URL]
|
||||||
[--rethinkdb-db RETHINKDB_DB] [--rethinkdb-big-table]
|
|
||||||
[--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
|
[--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
|
||||||
[--plugin PLUGIN_CLASS] [--version] [-v] [--trace] [-q]
|
[--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS]
|
||||||
|
[--version] [-v] [--trace] [-q]
|
||||||
|
|
||||||
warcprox - WARC writing MITM HTTP/S proxy
|
warcprox - WARC writing MITM HTTP/S proxy
|
||||||
|
|
||||||
@ -61,35 +63,38 @@ Usage
|
|||||||
address to listen on (default: localhost)
|
address to listen on (default: localhost)
|
||||||
-c CACERT, --cacert CACERT
|
-c CACERT, --cacert CACERT
|
||||||
CA certificate file; if file does not exist, it
|
CA certificate file; if file does not exist, it
|
||||||
will be created (default:
|
will be created (default: ./ayutla.local-warcprox-
|
||||||
./ayutla.monkeybrains.net-warcprox-ca.pem)
|
ca.pem)
|
||||||
--certs-dir CERTS_DIR
|
--certs-dir CERTS_DIR
|
||||||
where to store and load generated certificates
|
where to store and load generated certificates
|
||||||
(default: ./ayutla.monkeybrains.net-warcprox-ca)
|
(default: ./ayutla.local-warcprox-ca)
|
||||||
-d DIRECTORY, --dir DIRECTORY
|
-d DIRECTORY, --dir DIRECTORY
|
||||||
where to write warcs (default: ./warcs)
|
where to write warcs (default: ./warcs)
|
||||||
-z, --gzip write gzip-compressed warc records
|
-z, --gzip write gzip-compressed warc records
|
||||||
-n PREFIX, --prefix PREFIX
|
-n PREFIX, --prefix PREFIX
|
||||||
WARC filename prefix (default: WARCPROX)
|
default WARC filename prefix (default: WARCPROX)
|
||||||
-s SIZE, --size SIZE WARC file rollover size threshold in bytes
|
-s ROLLOVER_SIZE, --size ROLLOVER_SIZE
|
||||||
|
WARC file rollover size threshold in bytes
|
||||||
(default: 1000000000)
|
(default: 1000000000)
|
||||||
--rollover-idle-time ROLLOVER_IDLE_TIME
|
--rollover-idle-time ROLLOVER_IDLE_TIME
|
||||||
WARC file rollover idle time threshold in seconds
|
WARC file rollover idle time threshold in seconds
|
||||||
(so that Friday's last open WARC doesn't sit
|
(so that Friday's last open WARC doesn't sit there
|
||||||
there all weekend waiting for more data)
|
all weekend waiting for more data) (default: None)
|
||||||
(default: None)
|
|
||||||
-g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
|
-g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
|
||||||
digest algorithm, one of sha1, sha384, sha512,
|
digest algorithm, one of sha256, sha224, sha512,
|
||||||
md5, sha224, sha256 (default: sha1)
|
sha384, md5, sha1 (default: sha1)
|
||||||
--base32 write digests in Base32 instead of hex
|
--base32 write digests in Base32 instead of hex
|
||||||
--method-filter HTTP_METHOD
|
--method-filter HTTP_METHOD
|
||||||
only record requests with the given http
|
only record requests with the given http method(s)
|
||||||
method(s) (can be used more than once) (default:
|
(can be used more than once) (default: None)
|
||||||
None)
|
|
||||||
--stats-db-file STATS_DB_FILE
|
--stats-db-file STATS_DB_FILE
|
||||||
persistent statistics database file; empty string
|
persistent statistics database file; empty string
|
||||||
or /dev/null disables statistics tracking
|
or /dev/null disables statistics tracking
|
||||||
(default: ./warcprox.sqlite)
|
(default: ./warcprox.sqlite)
|
||||||
|
--rethinkdb-stats-url RETHINKDB_STATS_URL
|
||||||
|
rethinkdb stats table url, e.g. rethinkdb://db0.fo
|
||||||
|
o.org,db1.foo.org:38015/my_warcprox_db/my_stats_ta
|
||||||
|
ble (default: None)
|
||||||
-P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
|
-P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
|
||||||
port to listen on for instant playback (default:
|
port to listen on for instant playback (default:
|
||||||
None)
|
None)
|
||||||
@ -101,36 +106,44 @@ Usage
|
|||||||
persistent deduplication database file; empty
|
persistent deduplication database file; empty
|
||||||
string or /dev/null disables deduplication
|
string or /dev/null disables deduplication
|
||||||
(default: ./warcprox.sqlite)
|
(default: ./warcprox.sqlite)
|
||||||
--cdxserver-dedup CDX_SERVER_URL
|
--rethinkdb-dedup-url RETHINKDB_DEDUP_URL
|
||||||
use a CDX server for deduplication
|
rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,
|
||||||
|
db1.foo.org:38015/my_warcprox_db/my_dedup_table
|
||||||
(default: None)
|
(default: None)
|
||||||
--rethinkdb-servers RETHINKDB_SERVERS
|
--rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL
|
||||||
rethinkdb servers, used for dedup and stats if
|
rethinkdb big table url (table will be populated
|
||||||
specified; e.g.
|
with various capture information and is suitable
|
||||||
db0.foo.org,db0.foo.org:38015,db1.foo.org
|
for use as index for playback), e.g. rethinkdb://d
|
||||||
(default: None)
|
b0.foo.org,db1.foo.org:38015/my_warcprox_db/captur
|
||||||
--rethinkdb-db RETHINKDB_DB
|
es (default: None)
|
||||||
rethinkdb database name (ignored unless
|
--rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL
|
||||||
--rethinkdb-servers is specified) (default:
|
🐷 url pointing to trough configuration rethinkdb
|
||||||
warcprox)
|
database, e.g. rethinkdb://db0.foo.org,db1.foo.org
|
||||||
--rethinkdb-big-table
|
:38015/trough_configuration (default: None)
|
||||||
use a big rethinkdb table called "captures",
|
--cdxserver-dedup CDXSERVER_DEDUP
|
||||||
instead of a small table called "dedup"; table is
|
use a CDX Server URL for deduplication; e.g.
|
||||||
suitable for use as index for playback (ignored
|
https://web.archive.org/cdx/search (default: None)
|
||||||
unless --rethinkdb-servers is specified)
|
--rethinkdb-services-url RETHINKDB_SERVICES_URL
|
||||||
|
rethinkdb service registry table url; if provided,
|
||||||
|
warcprox will create and heartbeat an entry for
|
||||||
|
itself (default: None)
|
||||||
--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
|
--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
|
||||||
host:port of tor socks proxy, used only to
|
host:port of tor socks proxy, used only to connect
|
||||||
connect to .onion sites (default: None)
|
to .onion sites (default: None)
|
||||||
|
--crawl-log-dir CRAWL_LOG_DIR
|
||||||
|
if specified, write crawl log files in the
|
||||||
|
specified directory; one crawl log is written per
|
||||||
|
warc filename prefix; crawl log format mimics
|
||||||
|
heritrix (default: None)
|
||||||
--plugin PLUGIN_CLASS
|
--plugin PLUGIN_CLASS
|
||||||
Qualified name of plugin class, e.g.
|
Qualified name of plugin class, e.g.
|
||||||
"mypkg.mymod.MyClass". May be used multiple times
|
"mypkg.mymod.MyClass". May be used multiple times
|
||||||
to register multiple plugins. Plugin classes are
|
to register multiple plugins. Plugin classes are
|
||||||
loaded from the regular python module search
|
loaded from the regular python module search path.
|
||||||
path. They will be instantiated with no arguments
|
They will be instantiated with no arguments and
|
||||||
and must have a method `notify(self,
|
must have a method `notify(self, recorded_url,
|
||||||
recorded_url, records)` which will be called for
|
records)` which will be called for each url, after
|
||||||
each url, after warc records have been written.
|
warc records have been written. (default: None)
|
||||||
(default: None)
|
|
||||||
--version show program's version number and exit
|
--version show program's version number and exit
|
||||||
-v, --verbose
|
-v, --verbose
|
||||||
--trace
|
--trace
|
||||||
|
3
setup.py
3
setup.py
@ -52,7 +52,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.3.1b4.dev130',
|
version='2.3.1b4.dev131',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
@ -76,7 +76,6 @@ setuptools.setup(
|
|||||||
'Development Status :: 5 - Production/Stable',
|
'Development Status :: 5 - Production/Stable',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'License :: OSI Approved :: GNU General Public License (GPL)',
|
'License :: OSI Approved :: GNU General Public License (GPL)',
|
||||||
'Programming Language :: Python :: 2.7',
|
|
||||||
'Programming Language :: Python :: 3.4',
|
'Programming Language :: Python :: 3.4',
|
||||||
'Programming Language :: Python :: 3.5',
|
'Programming Language :: Python :: 3.5',
|
||||||
'Programming Language :: Python :: 3.6',
|
'Programming Language :: Python :: 3.6',
|
||||||
|
@ -183,9 +183,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
def _proxy_request(self):
|
def _proxy_request(self):
|
||||||
warcprox_meta = None
|
warcprox_meta = None
|
||||||
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
|
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
|
||||||
self.logger.log(
|
self.logger.trace(
|
||||||
warcprox.TRACE, 'request for %s Warcprox-Meta header: %s',
|
'request for %s Warcprox-Meta header: %s', self.url,
|
||||||
self.url, repr(raw_warcprox_meta))
|
raw_warcprox_meta)
|
||||||
if raw_warcprox_meta:
|
if raw_warcprox_meta:
|
||||||
warcprox_meta = json.loads(raw_warcprox_meta)
|
warcprox_meta = json.loads(raw_warcprox_meta)
|
||||||
del self.headers['Warcprox-Meta']
|
del self.headers['Warcprox-Meta']
|
||||||
|
Loading…
x
Reference in New Issue
Block a user