Mirror of https://github.com/internetarchive/warcprox.git, synced 2025-01-18 13:22:09 +01:00

Commit a1356709df

README.rst (deleted, 186 lines)

@@ -1,186 +0,0 @@

warcprox - WARC writing MITM HTTP/S proxy
-----------------------------------------

.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
    :target: https://travis-ci.org/internetarchive/warcprox

Based on the excellent and simple pymiproxy by Nadeem Douba.
https://github.com/allfro/pymiproxy

Install
~~~~~~~

Warcprox runs on python 3.4+.

To install latest release run:

::

    # apt-get install libffi-dev libssl-dev
    pip install warcprox

You can also install the latest bleeding edge code:

::

    pip install git+https://github.com/internetarchive/warcprox.git

Trusting the CA cert
~~~~~~~~~~~~~~~~~~~~

For best results while browsing through warcprox, you need to add the CA
cert as a trusted cert in your browser. If you don't do that, you will
get the warning when you visit each new site. But worse, any embedded
https content on a different server will simply fail to load, because
the browser will reject the certificate without telling you.

Plugins
~~~~~~~

Warcprox supports a limited notion of plugins by way of the `--plugin` command
line argument. Plugin classes are loaded from the regular python module search
path. They will be instantiated with one argument, a `warcprox.Options`, which
holds the values of all the command line arguments. Legacy plugins with
constructors that take no arguments are also supported. Plugins should either
have a method `notify(self, recorded_url, records)` or should subclass
`warcprox.BasePostfetchProcessor`. More than one plugin can be configured by
specifying `--plugin` multiple times.

`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__

Usage
~~~~~

::

    usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
                    [--certs-dir CERTS_DIR] [-d DIRECTORY]
                    [--warc-filename WARC_FILENAME] [-z] [-n PREFIX]
                    [-s ROLLOVER_SIZE]
                    [--rollover-idle-time ROLLOVER_IDLE_TIME]
                    [-g DIGEST_ALGORITHM] [--base32]
                    [--method-filter HTTP_METHOD]
                    [--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL]
                    [-P PLAYBACK_PORT]
                    [-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP]
                    [--rethinkdb-services-url RETHINKDB_SERVICES_URL]
                    [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
                    [--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS]
                    [--version] [-v] [--trace] [-q]

    warcprox - WARC writing MITM HTTP/S proxy

    optional arguments:
      -h, --help            show this help message and exit
      -p PORT, --port PORT  port to listen on (default: 8000)
      -b ADDRESS, --address ADDRESS
                            address to listen on (default: localhost)
      -c CACERT, --cacert CACERT
                            CA certificate file; if file does not exist, it
                            will be created (default:
                            ./ayutla.monkeybrains.net-warcprox-ca.pem)
      --certs-dir CERTS_DIR
                            where to store and load generated certificates
                            (default: ./ayutla.monkeybrains.net-warcprox-ca)
      -d DIRECTORY, --dir DIRECTORY
                            where to write warcs (default: ./warcs)
      --warc-filename WARC_FILENAME
                            define custom WARC filename with variables
                            {prefix}, {timestamp14}, {timestamp17},
                            {serialno}, {randomtoken}, {hostname},
                            {shorthostname} (default:
                            {prefix}-{timestamp17}-{serialno}-{randomtoken})
      -z, --gzip            write gzip-compressed warc records
      -n PREFIX, --prefix PREFIX
                            default WARC filename prefix (default: WARCPROX)
      -s ROLLOVER_SIZE, --size ROLLOVER_SIZE
                            WARC file rollover size threshold in bytes
                            (default: 1000000000)
      --rollover-idle-time ROLLOVER_IDLE_TIME
                            WARC file rollover idle time threshold in seconds
                            (so that Friday's last open WARC doesn't sit there
                            all weekend waiting for more data) (default: None)
      -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
                            digest algorithm, one of sha384, sha224, md5,
                            sha256, sha512, sha1 (default: sha1)
      --base32              write digests in Base32 instead of hex
      --method-filter HTTP_METHOD
                            only record requests with the given http method(s)
                            (can be used more than once) (default: None)
      --stats-db-file STATS_DB_FILE
                            persistent statistics database file; empty string
                            or /dev/null disables statistics tracking
                            (default: ./warcprox.sqlite)
      --rethinkdb-stats-url RETHINKDB_STATS_URL
                            rethinkdb stats table url, e.g. rethinkdb://db0.fo
                            o.org,db1.foo.org:38015/my_warcprox_db/my_stats_ta
                            ble (default: None)
      -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
                            port to listen on for instant playback (default:
                            None)
      -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
                            persistent deduplication database file; empty
                            string or /dev/null disables deduplication
                            (default: ./warcprox.sqlite)
      --rethinkdb-dedup-url RETHINKDB_DEDUP_URL
                            rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,
                            db1.foo.org:38015/my_warcprox_db/my_dedup_table
                            (default: None)
      --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL
                            rethinkdb big table url (table will be populated
                            with various capture information and is suitable
                            for use as index for playback), e.g. rethinkdb://d
                            b0.foo.org,db1.foo.org:38015/my_warcprox_db/captur
                            es (default: None)
      --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL
                            🐷 url pointing to trough configuration rethinkdb
                            database, e.g. rethinkdb://db0.foo.org,db1.foo.org
                            :38015/trough_configuration (default: None)
      --cdxserver-dedup CDXSERVER_DEDUP
                            use a CDX Server URL for deduplication; e.g.
                            https://web.archive.org/cdx/search (default: None)
      --rethinkdb-services-url RETHINKDB_SERVICES_URL
                            rethinkdb service registry table url; if provided,
                            warcprox will create and heartbeat entry for
                            itself (default: None)
      --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
                            host:port of tor socks proxy, used only to connect
                            to .onion sites (default: None)
      --crawl-log-dir CRAWL_LOG_DIR
                            if specified, write crawl log files in the
                            specified directory; one crawl log is written per
                            warc filename prefix; crawl log format mimics
                            heritrix (default: None)
      --plugin PLUGIN_CLASS
                            Qualified name of plugin class, e.g.
                            "mypkg.mymod.MyClass". May be used multiple times
                            to register multiple plugins. See README.rst for
                            more information. (default: None)
      --version             show program's version number and exit
      -v, --verbose
      --trace
      -q, --quiet

License
~~~~~~~

Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

api.rst (new file, 320 lines)

@@ -0,0 +1,320 @@

warcprox API
************

Means of interacting with warcprox over http, aside from simply proxying urls.

.. contents::

``/status`` url
===============

If warcprox is running at localhost:8000, http://localhost:8000/status returns
a json blob with a bunch of status info. For example:

::

    $ curl -sS http://localhost:8000/status
    {
      "rates_5min": {
        "warc_bytes_per_sec": 0.0,
        "urls_per_sec": 0.0,
        "actual_elapsed": 277.2983281612396
      },
      "version": "2.4b2.dev174",
      "load": 0.0,
      "seconds_behind": 0.0,
      "threads": 100,
      "warc_bytes_written": 0,
      "port": 8000,
      "postfetch_chain": [
        {
          "queued_urls": 0,
          "processor": "SkipFacebookCaptchas"
        },
        {
          "queued_urls": 0,
          "processor": "BatchTroughLoader"
        },
        {
          "queued_urls": 0,
          "processor": "WarcWriterProcessor"
        },
        {
          "queued_urls": 0,
          "processor": "BatchTroughStorer"
        },
        {
          "queued_urls": 0,
          "processor": "RethinkStatsProcessor"
        },
        {
          "queued_urls": 0,
          "processor": "CrawlLogger"
        },
        {
          "queued_urls": 0,
          "processor": "TroughFeed"
        },
        {
          "queued_urls": 0,
          "processor": "RunningStats"
        }
      ],
      "queue_max_size": 500,
      "role": "warcprox",
      "queued_urls": 0,
      "active_requests": 1,
      "host": "wbgrp-svc405.us.archive.org",
      "rates_15min": {
        "warc_bytes_per_sec": 0.0,
        "urls_per_sec": 0.0,
        "actual_elapsed": 876.9885368347168
      },
      "unaccepted_requests": 0,
      "urls_processed": 0,
      "pid": 18841,
      "address": "127.0.0.1",
      "rates_1min": {
        "warc_bytes_per_sec": 0.0,
        "urls_per_sec": 0.0,
        "actual_elapsed": 54.92501664161682
      },
      "start_time": 1526690353.4060142
    }

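Because the endpoint is plain json over http, it is easy to consume from a
monitoring script. A minimal sketch with the python ``requests`` library
(field names are taken from the example above; adjust host and port to your
deployment)::

    import requests

    status = requests.get('http://localhost:8000/status').json()
    print('%s urls processed, %s queued' % (
        status['urls_processed'], status['queued_urls']))
    for processor in status['postfetch_chain']:
        # each entry reports its processor name and current queue depth
        print('%(processor)s: %(queued_urls)s queued' % processor)
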
``WARCPROX_WRITE_RECORD`` http method
=====================================

To make warcprox write an arbitrary warc record you can send it a special
request with http method ``WARCPROX_WRITE_RECORD``. The http request must
include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``.
Warcprox will use these to populate the warc record. For example::

    $ ncat --crlf 127.0.0.1 8000 <<EOF
    > WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1
    > WARC-Type: resource
    > Content-type: text/plain;charset=utf-8
    > Content-length: 29
    >
    > i am a warc record payload!
    > EOF
    HTTP/1.0 204 OK
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Tue, 22 May 2018 19:21:02 GMT

On success warcprox responds with http status 204. For the request above
warcprox will write a warc record that looks like this::

    WARC/1.0
    WARC-Type: resource
    WARC-Record-ID: <urn:uuid:d0e10852-b18c-4037-a99e-f41915fec5b5>
    WARC-Date: 2018-05-21T23:33:31Z
    WARC-Target-URI: special://url/some?thing
    WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
    WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
    Content-Type: text/plain;charset=utf-8
    Content-Length: 29

    i am a warc record payload!

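The same request can be made from python without ncat. Here is a minimal
sketch using the standard library ``http.client``, which permits arbitrary
request methods; the url, headers and payload mirror the example above::

    import http.client

    payload = b'i am a warc record payload!\n'
    conn = http.client.HTTPConnection('127.0.0.1', 8000)
    conn.request(
        'WARCPROX_WRITE_RECORD', 'special://url/some?thing', body=payload,
        headers={
            'WARC-Type': 'resource',
            'Content-Type': 'text/plain;charset=utf-8',
            'Content-Length': str(len(payload)),
        })
    response = conn.getresponse()
    assert response.status == 204  # warcprox wrote the record
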
``Warcprox-Meta`` http request header
=====================================

``Warcprox-Meta`` is a special http request header that can be used to pass
configuration information and metadata with each proxy request to warcprox. The
value is a json blob. There are several fields understood by warcprox, and
arbitrary additional fields can be included. If warcprox doesn't recognize a
field it simply ignores it. Custom fields may be useful for custom warcprox
plugins (see `<readme.rst#plugins>`_).

Warcprox strips the ``warcprox-meta`` header out before sending the request to
the remote server, and does not write it in the warc request record.

Brozzler knows about ``warcprox-meta``. For information on configuring it in
brozzler, see
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta.
``Warcprox-Meta`` is often a very important part of brozzler job configuration.
It is the way url and data limits on jobs, seeds, and hosts are implemented,
among other things.

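Since the value is just json in an http header, any http client can set it. As
a rough sketch with the python ``requests`` library (the prefix name here is
made up for illustration), a client proxying through warcprox at
localhost:8000 might do::

    import json
    import requests

    warcprox_meta = {
        "warc-prefix": "my-crawl",
        "accept": ["capture-metadata"],
    }
    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    response = requests.get(
        'http://example.com/', proxies=proxies,
        headers={'Warcprox-Meta': json.dumps(warcprox_meta)})
    # if the request asked for it, warcprox echoes information back in a
    # Warcprox-Meta *response* header (see below)
    print(json.loads(response.headers.get('Warcprox-Meta', '{}')))
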
Warcprox-Meta fields
--------------------

``warc-prefix`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~
Specifies a warc filename prefix. Warcprox will write the warc record for this
capture, if any, to a warc named accordingly.

Example::

    Warcprox-Meta: {"warc-prefix": "special-warc"}

``dedup-bucket`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~~
Specifies the deduplication bucket. For more information about deduplication
see `<readme.rst#deduplication>`_.

Example::

    Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"}

``blocks`` (list)
~~~~~~~~~~~~~~~~~
List of url match rules. Url match rules are somewhat described at
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping
and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70.
(TODO: write a better doc and link to it)

Example::

    Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}

If any of the rules match the url being requested, warcprox aborts normal
processing and responds with a http ``403``. The http response includes
a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``,
which reproduces the value of the match rule that resulted in the block. The
presence of the ``warcprox-meta`` response header can be used by the client to
distinguish this type of response from a 403 from the remote site.

An example::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo
    HTTP/1.0 403 Forbidden
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 22:46:42 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 111
    Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}}

    request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"}

You might be wondering why ``blocks`` is necessary. Why would the warcprox
client make a request that it should already know will be blocked by the proxy?
The answer is that the request may be initiated somewhere where it's difficult
to evaluate the block rules. In particular, this circumstance prevails when the
browser controlled by brozzler is requesting images, javascript, css, and so
on, embedded in a page.

``stats`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~
``stats`` is a dictionary with only one field understood by warcprox,
``buckets``. The value of ``buckets`` is a list of strings and/or
dictionaries. A string signifies the name of the bucket; a dictionary is
expected to have at least an item with key ``bucket`` whose value is the name
of the bucket. The other currently recognized key is ``tally-domains``, which
if supplied should be a list of domains. This instructs warcprox to
additionally tally substats of the given bucket by domain.

See `<readme.rst#statistics>`_ for more information on statistics kept by
warcprox.

Examples::

    Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}}
    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

Domain stats are stored in the stats table under the key
``"bucket2:foo.bar.com"`` for the latter example. See the following two
sections for more examples. The ``soft-limits`` section has an example of a
limit on a domain specified in ``tally-domains``.

``limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~
Specifies quantitative limits for warcprox to enforce. The structure of the
dictionary is ``{stats_key: numerical_limit, ...}`` where the stats key has the
format ``"bucket/sub-bucket/statistic"``. See `readme.rst#statistics`_ for
further explanation of what "bucket", "sub-bucket", and "statistic" mean here.

If processing a request would result in exceeding a limit, warcprox aborts
normal processing and responds with a http ``420 Reached limit``. The http
response includes a ``Warcprox-Meta`` response header with the complete set
of statistics for the bucket whose limit has been reached.

Example::

    Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}

::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
    HTTP/1.0 420 Reached limit
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 23:08:32 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 77
    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}}

    request rejected by warcprox: reached limit test_limits_bucket/total/urls=10

``soft-limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
From warcprox's perspective ``soft-limits`` work almost exactly the same way
as ``limits``. The only difference is that when a soft limit is hit, warcprox
responds with an http ``430 Reached soft limit`` instead of http ``420``.

Warcprox clients might treat a ``430`` very differently from a ``420``. From
brozzler's perspective, for instance, ``soft-limits`` are very different from
``limits``. When brozzler receives a ``420`` from warcprox because a ``limit``
has been reached, this means that crawling for that seed is finished, and
brozzler sets about finalizing the crawl of that seed. On the other hand,
brozzler blissfully ignores ``430`` responses, because soft limits only apply
to a particular bucket (like a domain), and don't have any effect on crawling
of urls that don't fall in that bucket.

Example::

    Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}}

::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
    HTTP/1.0 430 Reached soft limit
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 23:12:06 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 82
    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}}

    request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10

``metadata`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~
An arbitrary dictionary. Warcprox mostly ignores this. The one exception is
that if it has a ``seed`` entry and crawl logs are enabled via the
``--crawl-log-dir`` command line option, the value of ``seed`` is written to
the crawl log as the 11th field on the line, simulating heritrix's "source
tag".

Example::

    Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"}}

``accept`` (list)
~~~~~~~~~~~~~~~~~
Specifies fields that the client would like to receive in the ``Warcprox-Meta``
response header. Only one value is currently understood,
``capture-metadata``.

Example::

    Warcprox-Meta: {"accept": ["capture-metadata"]}

The response will include a ``Warcprox-Meta`` response header with one field
also called ``capture-metadata``. Currently warcprox reports one piece of
capture metadata, ``timestamp``, which represents the time fetch began for the
resource and matches the ``WARC-Date`` written to the warc record. For
example::

    Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}}

``Warcprox-Meta`` http response header
======================================
In some cases warcprox will add a ``Warcprox-Meta`` header to the http response
that it sends to the client. As with the request header, the value is a json
blob. It is only included if something in the ``warcprox-meta`` request header
calls for it. Those cases are described above in the `Warcprox-Meta http
request header`_ section.

readme.rst (new file, 173 lines)

@@ -0,0 +1,173 @@

Warcprox - WARC writing MITM HTTP/S proxy
*****************************************

.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
    :target: https://travis-ci.org/internetarchive/warcprox

Warcprox is a tool for archiving the web. It is an http proxy that stores its
traffic to disk in `WARC
<https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/>`_
format. Warcprox captures encrypted https traffic by using the
`"man-in-the-middle" <https://en.wikipedia.org/wiki/Man-in-the-middle_attack>`_
technique (see the `Man-in-the-middle`_ section for more info).

The web pages that warcprox stores in WARC files can be played back using
software like `OpenWayback <https://github.com/iipc/openwayback>`_ or `pywb
<https://github.com/webrecorder/pywb>`_. Warcprox has been developed in
parallel with `brozzler <https://github.com/internetarchive/brozzler>`_ and
together they make a comprehensive modern distributed archival web crawling
system.

Warcprox was originally based on the excellent and simple pymiproxy by Nadeem
Douba. https://github.com/allfro/pymiproxy

.. contents::

Getting started
===============

Warcprox runs on python 3.4+.

To install the latest release run::

    # apt-get install libffi-dev libssl-dev
    pip install warcprox

You can also install the latest bleeding edge code::

    pip install git+https://github.com/internetarchive/warcprox.git

To start warcprox run::

    warcprox

Try ``warcprox --help`` for documentation on command line options.

Man-in-the-middle
=================

Normally, http proxies can't read https traffic, because it's encrypted. The
browser uses the http ``CONNECT`` method to establish a tunnel through the
proxy, and the proxy merely routes raw bytes between the client and server.
Since the bytes are encrypted, the proxy can't make sense of the information
it's proxying. This nonsensical encrypted data would not be very useful to
archive.

In order to capture https traffic, warcprox acts as a "man-in-the-middle"
(MITM). When it receives a ``CONNECT`` directive from a client, it generates a
public key certificate for the requested site, presents it to the client, and
proceeds to establish an encrypted connection with the client. Then it makes a
separate, normal https connection to the remote site. It decrypts, archives,
and re-encrypts traffic in both directions.

Although "man-in-the-middle" is often paired with "attack", there is nothing
malicious about what warcprox is doing. If you configure an instance of
warcprox as your browser's http proxy, you will see lots of certificate
warnings, since none of the certificates will be signed by trusted authorities.
To use warcprox effectively the client needs to disable certificate
verification, or add the CA cert generated by warcprox as a trusted authority.
(If you do this in your browser, make sure you undo it when you're done using
warcprox!)

API
===

For interacting with a running instance of warcprox.

* ``/status`` url
* ``WARCPROX_WRITE_RECORD`` http method
* ``Warcprox-Meta`` http request header and response header

See `<api.rst>`_.

Deduplication
=============

Warcprox avoids archiving redundant content by "deduplicating" it. The process
for deduplication works similarly to heritrix and other web archiving tools.

1. while fetching url, calculate payload content digest (typically sha1)
2. look up digest in deduplication database (warcprox supports a few different
   ones)
3. if found, write warc ``revisit`` record referencing the url and capture time
   of the previous capture
4. else (if not found),

   a. write warc ``response`` record with full payload
   b. store entry in deduplication database

The dedup database is partitioned into different "buckets". Urls are
deduplicated only against other captures in the same bucket. If specified, the
``dedup-bucket`` field of the ``Warcprox-Meta`` http request header determines
the bucket, otherwise the default bucket is used.

Deduplication can be disabled entirely by starting warcprox with the argument
``--dedup-db-file=/dev/null``.

Statistics
==========

Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb.
These are consulted for enforcing ``limits`` and ``soft-limits`` (see
`<api.rst#warcprox-meta-fields>`_), and can also be consulted by other
processes outside of warcprox, for reporting etc.

Statistics are grouped by "bucket". Every capture is counted as part of the
``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta``
request header. The fallback bucket in case none is specified is called
``__unspecified__``.

Within each bucket are three sub-buckets:

* ``new`` - tallies captures for which a complete record (usually a
  ``response`` record) was written to warc
* ``revisit`` - tallies captures for which a ``revisit`` record was written to
  warc
* ``total`` - includes all urls processed, even those not written to warc (so
  the numbers may be greater than new + revisit)

Within each of these sub-buckets we keep two statistics:

* ``urls`` - simple count of urls
* ``wire_bytes`` - sum of bytes received over the wire, including http
  headers, from the remote server for each url

For historical reasons, in sqlite, the default store, statistics are kept as
json blobs::

    sqlite> select * from buckets_of_stats;
    bucket           stats
    ---------------  ---------------------------------------------------------
    __unspecified__  {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
    __all__          {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}

Plugins
=======

Warcprox supports a limited notion of plugins by way of the ``--plugin``
command line argument. Plugin classes are loaded from the regular python module
search path. They will be instantiated with one argument, a
``warcprox.Options``, which holds the values of all the command line arguments.
Legacy plugins with constructors that take no arguments are also supported.
Plugins should either have a method ``notify(self, recorded_url, records)`` or
should subclass ``warcprox.BasePostfetchProcessor``. More than one plugin can
be configured by specifying ``--plugin`` multiple times.

`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__

License
=======

Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

setup.py (2 changed lines)

@@ -45,7 +45,7 @@ setuptools.setup(
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
     author_email='nlevitt@archive.org',
-    long_description=open('README.rst').read(),
+    long_description=open('readme.rst').read(),
     license='GPL',
     packages=['warcprox'],
     install_requires=deps,

@@ -709,6 +709,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10)

+    # next fetch hits the limit
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 420
     assert response.reason == "Reached limit"

@@ -717,6 +718,17 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
     assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"

+    # make sure limit doesn't get applied to a different stats bucket
+    request_meta = {"stats":{"buckets":["no_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}}
+    headers = {"Warcprox-Meta": json.dumps(request_meta)}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'i!'
+    assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11)
+
 def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

@@ -726,14 +738,16 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
     assert response.headers['Warcprox-Meta']
-    data = json.loads(response.headers['Warcprox-Meta'])
-    assert data['capture-metadata']
+    response_meta = json.loads(response.headers['Warcprox-Meta'])
+    assert response_meta['capture-metadata']
     try:
-        dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'],
+        dt = datetime.datetime.strptime(response_meta['capture-metadata']['timestamp'],
             '%Y-%m-%dT%H:%M:%SZ')
         assert dt
     except ValueError:
-        pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp'])
+        pytest.fail(
+            'Invalid http response warcprox-meta["capture-metadata"]["timestamp"]: %r',
+            meta['capture-metadata']['timestamp'])

     # wait for postfetch chain (or subsequent test could fail)
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

@@ -997,6 +1011,7 @@ def test_domain_doc_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

+    # ** comment is obsolete (server is multithreaded) but still useful **
     # we need to clear the connection pool here because
     # - connection pool already may already have an open connection localhost
     # - we're about to make a connection to foo.localhost

@@ -1132,6 +1147,23 @@ def test_domain_doc_soft_limit(
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
     assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"

+    # make sure soft limit doesn't get applied to a different stats bucket
+    request_meta = {
+        "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":["foo.localhost"]}]},
+        "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
+    }
+    headers = {"Warcprox-Meta": json.dumps(request_meta)}
+    url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
+    response = requests.get(
+        url, proxies=archiving_proxies, headers=headers, stream=True,
+        verify=False)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'o!'
+    assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22)
+
 def test_domain_data_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

@@ -1226,6 +1258,22 @@ def test_domain_data_soft_limit(
     ### assert response.headers["content-type"] == "text/plain;charset=utf-8"
     ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-2ka.localhost/new/wire_bytes=200\n"

+    # make sure soft limit doesn't get applied to a different stats bucket
+    request_meta = {
+        "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]},
+        "soft-limits": {"test_domain_data_limit_bucket:ÞZZ.localhost/new/wire_bytes":200},
+    }
+    headers = {"Warcprox-Meta": json.dumps(request_meta)}
+    url = 'http://ÞZz.localhost:{}/y/z'.format(http_daemon.server_port)
+    response = requests.get(
+        url, proxies=archiving_proxies, headers=headers, stream=True)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'y!'
+    assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
+
 # XXX this test relies on a tor proxy running at localhost:9050 with a working
 # connection to the internet, and relies on a third party site (facebook) being
 # up and behaving a certain way

@@ -193,7 +193,7 @@ def _build_arg_parser(prog='warcprox'):
         action='append', help=(
             'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
             'May be used multiple times to register multiple plugins. '
-            'See README.rst for more information.'))
+            'See readme.rst for more information.'))
     arg_parser.add_argument('--version', action='version',
         version="warcprox {}".format(warcprox.__version__))
     arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')

@@ -53,6 +53,53 @@ def _empty_bucket(bucket):
         },
     }

+def unravel_buckets(url, warcprox_meta):
+    '''
+    Unravels bucket definitions in Warcprox-Meta header. Each bucket
+    definition can either be a string, which signifies the name of the
+    bucket, or a dict. If a dict it is expected to have at least an item
+    with key 'bucket' whose value is the name of the bucket. The other
+    currently recognized item is 'tally-domains', which if supplied should
+    be a list of domains. This instructs warcprox to additionally tally
+    substats of the given bucket by domain. Host stats are stored in the
+    stats table under the key '{parent-bucket}:{domain(normalized)}'.
+
+    Returns:
+        list of strings
+
+    Example Warcprox-Meta header (a real one will likely have other
+    sections besides 'stats'):
+
+        Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
+
+    In this case the return value would be
+    ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
+    '''
+    buckets = ["__all__"]
+    if (warcprox_meta and "stats" in warcprox_meta
+            and "buckets" in warcprox_meta["stats"]):
+        for bucket in warcprox_meta["stats"]["buckets"]:
+            if isinstance(bucket, dict):
+                if not 'bucket' in bucket:
+                    self.logger.warn(
+                            'ignoring invalid stats bucket in '
+                            'warcprox-meta header %s', bucket)
+                    continue
+                buckets.append(bucket['bucket'])
+                if bucket.get('tally-domains'):
+                    canon_url = urlcanon.semantic(url)
+                    for domain in bucket['tally-domains']:
+                        domain = urlcanon.normalize_host(domain).decode('ascii')
+                        if urlcanon.url_matches_domain(canon_url, domain):
+                            buckets.append(
+                                    '%s:%s' % (bucket['bucket'], domain))
+            else:
+                buckets.append(bucket)
+    else:
+        buckets.append("__unspecified__")
+
+    return buckets
+
 class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
     logger = logging.getLogger("warcprox.stats.StatsProcessor")

@@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
         return None

     def buckets(self, recorded_url):
-        '''
-        Unravels bucket definitions in Warcprox-Meta header. Each bucket
-        definition can either be a string, which signifies the name of the
-        bucket, or a dict. If a dict it is expected to have at least an item
-        with key 'bucket' whose value is the name of the bucket. The other
-        currently recognized item is 'tally-domains', which if supplied should
-        be a list of domains. This instructs warcprox to additionally tally
-        substats of the given bucket by domain. Host stats are stored in the
-        stats table under the key '{parent-bucket}:{domain(normalized)}'.
-
-        Example Warcprox-Meta header (a real one will likely have other
-        sections besides 'stats'):
-
-            Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}}
-        '''
-        buckets = ["__all__"]
-        if (recorded_url.warcprox_meta
-                and "stats" in recorded_url.warcprox_meta
-                and "buckets" in recorded_url.warcprox_meta["stats"]):
-            for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
-                if isinstance(bucket, dict):
-                    if not 'bucket' in bucket:
-                        self.logger.warn(
-                                'ignoring invalid stats bucket in '
-                                'warcprox-meta header %s', bucket)
-                        continue
-                    buckets.append(bucket['bucket'])
-                    if bucket.get('tally-domains'):
-                        url = urlcanon.semantic(recorded_url.url)
-                        for domain in bucket['tally-domains']:
-                            domain = urlcanon.normalize_host(domain).decode('ascii')
-                            if urlcanon.url_matches_domain(url, domain):
-                                buckets.append(
-                                        '%s:%s' % (bucket['bucket'], domain))
-                else:
-                    buckets.append(bucket)
-        else:
-            buckets.append("__unspecified__")
-
-        return buckets
+        return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta)

 class RethinkStatsProcessor(StatsProcessor):
     logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor")

@@ -301,11 +309,9 @@ class RunningStats:
         need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10
         if need_minute_snap:
             self.minute_snaps.append((now, self.urls, self.warc_bytes))
-            logging.debug('added minute snap %r', self.minute_snaps[-1])
         if need_ten_sec_snap:
             self.ten_sec_snaps.popleft()
             self.ten_sec_snaps.append((now, self.urls, self.warc_bytes))
-            logging.trace('rotated in ten second snap %r', self.ten_sec_snaps[-1])

     def _closest_ten_sec_snap(self, t):
         # it's a deque so iterating over it is faster than indexed lookup

@@ -72,13 +72,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 block_rule = urlcanon.MatchRule(**rule)
                 if block_rule.applies(url):
                     body = ("request rejected by warcprox: blocked by "
-                            "rule found in Warcprox-Meta header: %s"
-                            % rule).encode("utf-8")
+                            "rule found in Warcprox-Meta header: %s\n"
+                            % json.dumps(rule)).encode("utf-8")
                     self.send_response(403, "Forbidden")
                     self.send_header("Content-Type", "text/plain;charset=utf-8")
                     self.send_header("Connection", "close")
                     self.send_header("Content-Length", len(body))
-                    response_meta = {"blocked-by-rule":rule}
+                    response_meta = {"blocked-by-rule": rule}
                     self.send_header(
                             "Warcprox-Meta",
                             json.dumps(response_meta, separators=(",",":")))

@@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                     self.client_address[0], self.command,
                     self.url, rule))

-    def _enforce_limit(self, limit_key, limit_value, soft=False):
+    def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
         if not self.server.stats_db:
             return
-        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
-        _limit_key = limit_key

-        # if limit_key looks like 'job1:foo.com/total/urls' then we only want
-        # to apply this rule if the requested url is within domain
-        bucket0_fields = bucket0.split(':')
-        if len(bucket0_fields) == 2:
-            domain = urlcanon.normalize_host(bucket0_fields[1])
-            if not urlcanon.host_matches_domain(self.hostname, domain):
-                return # else host matches, go ahead and enforce the limit
-            bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
-            _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
+        # parse limit key
+        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
+        # normalize domain if part of bucket
+        if ":" in bucket0:
+            b, raw_domain = bucket0.split(":", 1)
+            domain = urlcanon.normalize_host(raw_domain).decode("ascii")
+            bucket0 = "%s:%s" % (b, domain)
+            limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)
+
+        if not bucket0 in buckets:
+            return

         value = self.server.stats_db.value(bucket0, bucket1, bucket2)
         if value and limit_value and limit_value > 0 and value >= limit_value:
             body = ("request rejected by warcprox: reached %s %s=%s\n" % (
-                    "soft limit" if soft else "limit", _limit_key,
+                    "soft limit" if soft else "limit", limit_key,
                     limit_value)).encode("utf-8")
             if soft:
                 self.send_response(430, "Reached soft limit")

@@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 "stats": {bucket0:self.server.stats_db.value(bucket0)}
             }
             if soft:
-                response_meta["reached-soft-limit"] = {_limit_key:limit_value}
+                response_meta["reached-soft-limit"] = {limit_key:limit_value}
             else:
-                response_meta["reached-limit"] = {_limit_key:limit_value}
+                response_meta["reached-limit"] = {limit_key:limit_value}
             self.send_header(
-                    "Warcprox-Meta",
-                    json.dumps(response_meta, separators=(",",":")))
+                    "Warcprox-Meta", json.dumps(response_meta, separators=",:"))
             self.end_headers()
             if self.command != "HEAD":
                 self.wfile.write(body)

@@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                     self.client_address[0], 430 if soft else 420,
                     self.command, self.url,
                     "soft limit" if soft else "limit",
-                    _limit_key, limit_value))
+                    limit_key, limit_value))

     def _enforce_limits(self, warcprox_meta):
         """

@@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
        warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
        reached.
        """
+        buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta)
         if warcprox_meta and "limits" in warcprox_meta:
             for item in warcprox_meta["limits"].items():
                 limit_key, limit_value = item
-                self._enforce_limit(limit_key, limit_value, soft=False)
+                self._enforce_limit(buckets, limit_key, limit_value, soft=False)
         if warcprox_meta and "soft-limits" in warcprox_meta:
             for item in warcprox_meta["soft-limits"].items():
                 limit_key, limit_value = item
-                self._enforce_limit(limit_key, limit_value, soft=True)
+                self._enforce_limit(buckets, limit_key, limit_value, soft=True)

     def _security_check(self, warcprox_meta):
         '''