Mirror of https://github.com/internetarchive/warcprox.git
Synced 2025-01-18 13:22:09 +01:00

Commit a1356709df

README.rst (186 lines, deleted)

@@ -1,186 +0,0 @@
warcprox - WARC writing MITM HTTP/S proxy
-----------------------------------------
.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
    :target: https://travis-ci.org/internetarchive/warcprox

Based on the excellent and simple pymiproxy by Nadeem Douba.
https://github.com/allfro/pymiproxy

Install
~~~~~~~

Warcprox runs on python 3.4+.

To install latest release run:

::

    # apt-get install libffi-dev libssl-dev
    pip install warcprox

You can also install the latest bleeding edge code:

::

    pip install git+https://github.com/internetarchive/warcprox.git


Trusting the CA cert
~~~~~~~~~~~~~~~~~~~~

For best results while browsing through warcprox, you need to add the CA
cert as a trusted cert in your browser. If you don't do that, you will
get a warning when you visit each new site. But worse, any embedded
https content on a different server will simply fail to load, because
the browser will reject the certificate without telling you.

Plugins
~~~~~~~

Warcprox supports a limited notion of plugins by way of the `--plugin` command
line argument. Plugin classes are loaded from the regular python module search
path. They will be instantiated with one argument, a `warcprox.Options`, which
holds the values of all the command line arguments. Legacy plugins with
constructors that take no arguments are also supported. Plugins should either
have a method `notify(self, recorded_url, records)` or should subclass
`warcprox.BasePostfetchProcessor`. More than one plugin can be configured by
specifying `--plugin` multiple times.

`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__

Usage
~~~~~

::

    usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
                    [--certs-dir CERTS_DIR] [-d DIRECTORY]
                    [--warc-filename WARC_FILENAME] [-z] [-n PREFIX]
                    [-s ROLLOVER_SIZE]
                    [--rollover-idle-time ROLLOVER_IDLE_TIME]
                    [-g DIGEST_ALGORITHM] [--base32]
                    [--method-filter HTTP_METHOD]
                    [--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL]
                    [-P PLAYBACK_PORT]
                    [-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP]
                    [--rethinkdb-services-url RETHINKDB_SERVICES_URL]
                    [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
                    [--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS]
                    [--version] [-v] [--trace] [-q]

    warcprox - WARC writing MITM HTTP/S proxy

    optional arguments:
      -h, --help            show this help message and exit
      -p PORT, --port PORT  port to listen on (default: 8000)
      -b ADDRESS, --address ADDRESS
                            address to listen on (default: localhost)
      -c CACERT, --cacert CACERT
                            CA certificate file; if file does not exist, it
                            will be created (default:
                            ./ayutla.monkeybrains.net-warcprox-ca.pem)
      --certs-dir CERTS_DIR
                            where to store and load generated certificates
                            (default: ./ayutla.monkeybrains.net-warcprox-ca)
      -d DIRECTORY, --dir DIRECTORY
                            where to write warcs (default: ./warcs)
      --warc-filename WARC_FILENAME
                            define custom WARC filename with variables
                            {prefix}, {timestamp14}, {timestamp17},
                            {serialno}, {randomtoken}, {hostname},
                            {shorthostname} (default:
                            {prefix}-{timestamp17}-{serialno}-{randomtoken})
      -z, --gzip            write gzip-compressed warc records
      -n PREFIX, --prefix PREFIX
                            default WARC filename prefix (default: WARCPROX)
      -s ROLLOVER_SIZE, --size ROLLOVER_SIZE
                            WARC file rollover size threshold in bytes
                            (default: 1000000000)
      --rollover-idle-time ROLLOVER_IDLE_TIME
                            WARC file rollover idle time threshold in seconds
                            (so that Friday's last open WARC doesn't sit there
                            all weekend waiting for more data) (default: None)
      -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
                            digest algorithm, one of sha384, sha224, md5,
                            sha256, sha512, sha1 (default: sha1)
      --base32              write digests in Base32 instead of hex
      --method-filter HTTP_METHOD
                            only record requests with the given http method(s)
                            (can be used more than once) (default: None)
      --stats-db-file STATS_DB_FILE
                            persistent statistics database file; empty string
                            or /dev/null disables statistics tracking
                            (default: ./warcprox.sqlite)
      --rethinkdb-stats-url RETHINKDB_STATS_URL
                            rethinkdb stats table url, e.g.
                            rethinkdb://db0.foo.org,db1.foo.org:38015/my_warcprox_db/my_stats_table
                            (default: None)
      -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
                            port to listen on for instant playback (default:
                            None)
      -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
                            persistent deduplication database file; empty
                            string or /dev/null disables deduplication
                            (default: ./warcprox.sqlite)
      --rethinkdb-dedup-url RETHINKDB_DEDUP_URL
                            rethinkdb dedup url, e.g.
                            rethinkdb://db0.foo.org,db1.foo.org:38015/my_warcprox_db/my_dedup_table
                            (default: None)
      --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL
                            rethinkdb big table url (table will be populated
                            with various capture information and is suitable
                            for use as index for playback), e.g.
                            rethinkdb://db0.foo.org,db1.foo.org:38015/my_warcprox_db/captures
                            (default: None)
      --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL
                            🐷 url pointing to trough configuration rethinkdb
                            database, e.g.
                            rethinkdb://db0.foo.org,db1.foo.org:38015/trough_configuration
                            (default: None)
      --cdxserver-dedup CDXSERVER_DEDUP
                            use a CDX Server URL for deduplication; e.g.
                            https://web.archive.org/cdx/search (default: None)
      --rethinkdb-services-url RETHINKDB_SERVICES_URL
                            rethinkdb service registry table url; if provided,
                            warcprox will create and heartbeat entry for
                            itself (default: None)
      --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
                            host:port of tor socks proxy, used only to connect
                            to .onion sites (default: None)
      --crawl-log-dir CRAWL_LOG_DIR
                            if specified, write crawl log files in the
                            specified directory; one crawl log is written per
                            warc filename prefix; crawl log format mimics
                            heritrix (default: None)
      --plugin PLUGIN_CLASS
                            Qualified name of plugin class, e.g.
                            "mypkg.mymod.MyClass". May be used multiple times
                            to register multiple plugins. See README.rst for
                            more information. (default: None)
      --version             show program's version number and exit
      -v, --verbose
      --trace
      -q, --quiet

License
~~~~~~~

Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

api.rst (320 lines, new file)

@@ -0,0 +1,320 @@
warcprox API
************

Means of interacting with warcprox over http, aside from simply proxying urls.

.. contents::

``/status`` url
===============

If warcprox is running at localhost:8000, http://localhost:8000/status returns
a json blob with a bunch of status info. For example:

::

    $ curl -sS http://localhost:8000/status
    {
        "rates_5min": {
            "warc_bytes_per_sec": 0.0,
            "urls_per_sec": 0.0,
            "actual_elapsed": 277.2983281612396
        },
        "version": "2.4b2.dev174",
        "load": 0.0,
        "seconds_behind": 0.0,
        "threads": 100,
        "warc_bytes_written": 0,
        "port": 8000,
        "postfetch_chain": [
            {
                "queued_urls": 0,
                "processor": "SkipFacebookCaptchas"
            },
            {
                "queued_urls": 0,
                "processor": "BatchTroughLoader"
            },
            {
                "queued_urls": 0,
                "processor": "WarcWriterProcessor"
            },
            {
                "queued_urls": 0,
                "processor": "BatchTroughStorer"
            },
            {
                "queued_urls": 0,
                "processor": "RethinkStatsProcessor"
            },
            {
                "queued_urls": 0,
                "processor": "CrawlLogger"
            },
            {
                "queued_urls": 0,
                "processor": "TroughFeed"
            },
            {
                "queued_urls": 0,
                "processor": "RunningStats"
            }
        ],
        "queue_max_size": 500,
        "role": "warcprox",
        "queued_urls": 0,
        "active_requests": 1,
        "host": "wbgrp-svc405.us.archive.org",
        "rates_15min": {
            "warc_bytes_per_sec": 0.0,
            "urls_per_sec": 0.0,
            "actual_elapsed": 876.9885368347168
        },
        "unaccepted_requests": 0,
        "urls_processed": 0,
        "pid": 18841,
        "address": "127.0.0.1",
        "rates_1min": {
            "warc_bytes_per_sec": 0.0,
            "urls_per_sec": 0.0,
            "actual_elapsed": 54.92501664161682
        },
        "start_time": 1526690353.4060142
    }
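
A crawl controller can poll this url to tell when warcprox has finished
in-flight work. A minimal sketch, not part of warcprox itself; treating
``active_requests <= 1`` (the status request itself) plus empty queues as
"idle" is an assumption, not a documented contract::

    import time
    import requests

    def wait_until_idle(status_url='http://localhost:8000/status', timeout=300):
        '''Polls the /status url until all postfetch queues have drained.'''
        start = time.time()
        while time.time() - start < timeout:
            status = requests.get(status_url).json()
            queued = status['queued_urls'] + sum(
                    step['queued_urls'] for step in status['postfetch_chain'])
            if queued == 0 and status['active_requests'] <= 1:
                return status
            time.sleep(2)
        raise RuntimeError('warcprox still busy after %s seconds' % timeout)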

``WARCPROX_WRITE_RECORD`` http method
=====================================

To make warcprox write an arbitrary warc record you can send it a special
request with http method ``WARCPROX_WRITE_RECORD``. The http request must
include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``.
Warcprox will use these to populate the warc record. For example::

    $ ncat --crlf 127.0.0.1 8000 <<EOF
    > WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1
    > WARC-Type: resource
    > Content-type: text/plain;charset=utf-8
    > Content-length: 29
    >
    > i am a warc record payload!
    > EOF
    HTTP/1.0 204 OK
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Tue, 22 May 2018 19:21:02 GMT

On success warcprox responds with http status 204. For the request above
warcprox will write a warc record that looks like this::

    WARC/1.0
    WARC-Type: resource
    WARC-Record-ID: <urn:uuid:d0e10852-b18c-4037-a99e-f41915fec5b5>
    WARC-Date: 2018-05-21T23:33:31Z
    WARC-Target-URI: special://url/some?thing
    WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
    WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
    Content-Type: text/plain;charset=utf-8
    Content-Length: 29

    i am a warc record payload!
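
The same request can be made from python with the standard library. A sketch
using the example url above and a similar payload; only ``http.client`` is
needed because the request goes straight to warcprox's port rather than
through it as a proxy::

    import http.client

    payload = b'i am a warc record payload!\n'
    conn = http.client.HTTPConnection('127.0.0.1', 8000)
    conn.request(
            'WARCPROX_WRITE_RECORD', 'special://url/some?thing', body=payload,
            headers={
                'WARC-Type': 'resource',
                'Content-Type': 'text/plain;charset=utf-8',
                'Content-Length': str(len(payload)),
            })
    # warcprox answers 204 if it accepted the record
    assert conn.getresponse().status == 204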

``Warcprox-Meta`` http request header
=====================================

``Warcprox-Meta`` is a special http request header that can be used to pass
configuration information and metadata with each proxy request to warcprox. The
value is a json blob. There are several fields understood by warcprox, and
arbitrary additional fields can be included. If warcprox doesn't recognize a
field it simply ignores it. Custom fields may be useful for custom warcprox
plugins (see `<readme.rst#plugins>`_).

Warcprox strips the ``warcprox-meta`` header out before sending the request to
the remote server, and does not write it in the warc request record.

Brozzler knows about ``warcprox-meta``. For information on configuring
it in brozzler, see
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta.
``Warcprox-Meta`` is often a very important part of brozzler job configuration.
It is the way url and data limits on jobs, seeds, and hosts are implemented,
among other things.
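
Since the header value is just json, any http client can send it. A sketch of
how a python client might combine a couple of the fields described below (the
prefix and bucket names here are made up)::

    import json
    import requests

    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    warcprox_meta = {
        'warc-prefix': 'special-warc',
        'stats': {'buckets': ['my-stats-bucket']},
    }
    response = requests.get(
            'http://example.com/', proxies=proxies,
            headers={'Warcprox-Meta': json.dumps(warcprox_meta)})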

Warcprox-Meta fields
--------------------

``warc-prefix`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~
Specifies a warc filename prefix. Warcprox will write the warc record for this
capture, if any, to a warc named accordingly.

Example::

    Warcprox-Meta: {"warc-prefix": "special-warc"}

``dedup-bucket`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~~
Specifies the deduplication bucket. For more information about deduplication
see `<readme.rst#deduplication>`_.

Example::

    Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"}

``blocks`` (list)
~~~~~~~~~~~~~~~~~
List of url match rules. Url match rules are somewhat described at
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping
and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70.
(TODO: write a better doc and link to it)

Example::

    Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}

If any of the rules match the url being requested, warcprox aborts normal
processing and responds with a http ``403``. The http response includes
a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``,
which reproduces the value of the match rule that resulted in the block. The
presence of the ``warcprox-meta`` response header can be used by the client to
distinguish this type of response from a 403 coming from the remote site.

An example::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo
    HTTP/1.0 403 Forbidden
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 22:46:42 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 111
    Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}}

    request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"}

You might be wondering why ``blocks`` is necessary. Why would the warcprox
client make a request that it should already know will be blocked by the proxy?
The answer is that the request may be initiated somewhere where it's difficult
to evaluate the block rules. In particular, this circumstance prevails when the
browser controlled by brozzler is requesting images, javascript, css, and so
on, embedded in a page.

``stats`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~
``stats`` is a dictionary with only one field understood by warcprox,
``buckets``. The value of ``buckets`` is a list of strings and/or
dictionaries. A string signifies the name of the bucket; a dictionary is
expected to have at least an item with key ``bucket`` whose value is the name
of the bucket. The other currently recognized key is ``tally-domains``, which
if supplied should be a list of domains. This instructs warcprox to
additionally tally substats of the given bucket by domain.

See `<readme.rst#statistics>`_ for more information on statistics kept by
warcprox.

Examples::

    Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}}
    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

Domain stats are stored in the stats table under the key
``"bucket2:foo.bar.com"`` for the latter example. See the following two
sections for more examples. The ``soft-limits`` section has an example of a
limit on a domain specified in ``tally-domains``.

``limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~
Specifies quantitative limits for warcprox to enforce. The structure of the
dictionary is ``{stats_key: numerical_limit, ...}`` where stats key has the
format ``"bucket/sub-bucket/statistic"``. See `<readme.rst#statistics>`_ for
further explanation of what "bucket", "sub-bucket", and "statistic" mean here.

If processing a request would result in exceeding a limit, warcprox aborts
normal processing and responds with a http ``420 Reached Limit``. The http
response includes a ``Warcprox-Meta`` response header with the complete set
of statistics for the bucket whose limit has been reached.

Example::

    Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}

::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
    HTTP/1.0 420 Reached limit
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 23:08:32 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 77
    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}}

    request rejected by warcprox: reached limit test_limits_bucket/total/urls=10

``soft-limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
From warcprox's perspective ``soft-limits`` work almost exactly the same way
as ``limits``. The only difference is that when a soft limit is hit, warcprox
responds with an http ``430 Reached soft limit`` instead of http ``420``.

Warcprox clients might treat a ``430`` very differently from a ``420``. From
brozzler's perspective, for instance, ``soft-limits`` are very different from
``limits``. When brozzler receives a ``420`` from warcprox because a ``limit``
has been reached, this means that crawling for that seed is finished, and
brozzler sets about finalizing the crawl of that seed. On the other hand,
brozzler blissfully ignores ``430`` responses, because soft limits only apply
to a particular bucket (like a domain), and don't have any effect on crawling
of urls that don't fall in that bucket.

Example::

    Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}}

::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
    HTTP/1.0 430 Reached soft limit
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 23:12:06 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 82
    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}}

    request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10
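
A client can tell these rejections apart from ordinary responses by the status
code and by which key appears in the ``Warcprox-Meta`` response header. A
sketch of how a python client might react (this is illustrative, not brozzler's
actual logic)::

    import json
    import requests

    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    meta = {'stats': {'buckets': ['test_limits_bucket']},
            'limits': {'test_limits_bucket/total/urls': 10}}
    response = requests.get(
            'http://example.com/foo', proxies=proxies,
            headers={'Warcprox-Meta': json.dumps(meta)})
    if response.status_code in (420, 430):
        info = json.loads(response.headers['Warcprox-Meta'])
        limit = info.get('reached-limit') or info.get('reached-soft-limit')
        print('rejected:', limit, 'current stats:', info['stats'])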

``metadata`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~
An arbitrary dictionary. Warcprox mostly ignores this. The one exception is
that if it has a ``seed`` entry and crawl logs are enabled via the
``--crawl-log-dir`` command line option, the value of ``seed`` is written to
the crawl log as the 11th field on the line, simulating heritrix's "source
tag".

Example::

    Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"}}

``accept`` (list)
~~~~~~~~~~~~~~~~~
Specifies fields that the client would like to receive in the ``Warcprox-Meta``
response header. Only one value is currently understood,
``capture-metadata``.

Example::

    Warcprox-Meta: {"accept": ["capture-metadata"]}

The response will include a ``Warcprox-Meta`` response header with one field
also called ``capture-metadata``. Currently warcprox reports one piece of
capture metadata, ``timestamp``, which represents the time fetch began for the
resource and matches the ``WARC-Date`` written to the warc record. For
example::

    Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}}

``Warcprox-Meta`` http response header
======================================
In some cases warcprox will add a ``Warcprox-Meta`` header to the http response
that it sends to the client. As with the request header, the value is a json
blob. It is only included if something in the ``warcprox-meta`` request header
calls for it. Those cases are described above in the `Warcprox-Meta http
request header`_ section.

readme.rst (173 lines, new file)

@@ -0,0 +1,173 @@
Warcprox - WARC writing MITM HTTP/S proxy
*****************************************
.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
    :target: https://travis-ci.org/internetarchive/warcprox

Warcprox is a tool for archiving the web. It is an http proxy that stores its
traffic to disk in `WARC
<https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/>`_
format. Warcprox captures encrypted https traffic by using the
`"man-in-the-middle" <https://en.wikipedia.org/wiki/Man-in-the-middle_attack>`_
technique (see the `Man-in-the-middle`_ section for more info).

The web pages that warcprox stores in WARC files can be played back using
software like `OpenWayback <https://github.com/iipc/openwayback>`_ or `pywb
<https://github.com/webrecorder/pywb>`_. Warcprox has been developed in
parallel with `brozzler <https://github.com/internetarchive/brozzler>`_ and
together they make a comprehensive modern distributed archival web crawling
system.

Warcprox was originally based on the excellent and simple pymiproxy by Nadeem
Douba. https://github.com/allfro/pymiproxy

.. contents::

Getting started
===============
Warcprox runs on python 3.4+.

To install latest release run::

    # apt-get install libffi-dev libssl-dev
    pip install warcprox

You can also install the latest bleeding edge code::

    pip install git+https://github.com/internetarchive/warcprox.git

To start warcprox run::

    warcprox

Try ``warcprox --help`` for documentation on command line options.

Man-in-the-middle
=================
Normally, http proxies can't read https traffic, because it's encrypted. The
browser uses the http ``CONNECT`` method to establish a tunnel through the
proxy, and the proxy merely routes raw bytes between the client and server.
Since the bytes are encrypted, the proxy can't make sense of the information
it's proxying. This nonsensical encrypted data would not be very useful to
archive.

In order to capture https traffic, warcprox acts as a "man-in-the-middle"
(MITM). When it receives a ``CONNECT`` directive from a client, it generates a
public key certificate for the requested site, presents it to the client, and
proceeds to establish an encrypted connection with the client. Then it makes a
separate, normal https connection to the remote site. It decrypts, archives,
and re-encrypts traffic in both directions.

Although "man-in-the-middle" is often paired with "attack", there is nothing
malicious about what warcprox is doing. If you configure an instance of
warcprox as your browser's http proxy, you will see lots of certificate
warnings, since none of the certificates will be signed by trusted authorities.
To use warcprox effectively the client needs to disable certificate
verification, or add the CA cert generated by warcprox as a trusted authority.
(If you do this in your browser, make sure you undo it when you're done using
warcprox!)
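
For a scripted client, trusting the warcprox CA is usually nicer than disabling
verification. A sketch using python requests; the CA cert filename here is made
up, use whatever ``--cacert`` points to (by default a pem named after the local
hostname in the directory where warcprox runs, as in the usage output)::

    import requests

    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    response = requests.get(
            'https://example.com/', proxies=proxies,
            verify='./myhost-warcprox-ca.pem')  # warcprox's generated CA cert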

API
===
For interacting with a running instance of warcprox.

* ``/status`` url
* ``WARCPROX_WRITE_RECORD`` http method
* ``Warcprox-Meta`` http request header and response header

See `<api.rst>`_.

Deduplication
=============
Warcprox avoids archiving redundant content by "deduplicating" it. The process
for deduplication works similarly to heritrix and other web archiving tools.

1. while fetching url, calculate payload content digest (typically sha1)
2. look up digest in deduplication database (warcprox supports a few different
   ones)
3. if found, write warc ``revisit`` record referencing the url and capture time
   of the previous capture
4. else (if not found),

   a. write warc ``response`` record with full payload
   b. store entry in deduplication database

The dedup database is partitioned into different "buckets". Urls are
deduplicated only against other captures in the same bucket. If specified, the
``dedup-bucket`` field of the ``Warcprox-Meta`` http request header determines
the bucket, otherwise the default bucket is used.

Deduplication can be disabled entirely by starting warcprox with the argument
``--dedup-db-file=/dev/null``.
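
A rough sketch of the decision above, in python, with a plain dict standing in
for the deduplication database (warcprox's real dedup backends are more
involved, but the per-capture logic is essentially this)::

    import hashlib

    dedup_db = {}  # {(bucket, digest): info about the earlier capture}

    def dedup_decision(bucket, url, capture_date, payload):
        '''Returns 'revisit' or 'response' per steps 1-4 above.'''
        digest = 'sha1:' + hashlib.sha1(payload).hexdigest()
        earlier = dedup_db.get((bucket, digest))
        if earlier:
            # write a warc revisit record referencing earlier['url'], earlier['date']
            return 'revisit'
        # write a full warc response record and remember this capture
        dedup_db[(bucket, digest)] = {'url': url, 'date': capture_date}
        return 'response'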

Statistics
==========
Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb.
These are consulted for enforcing ``limits`` and ``soft-limits`` (see
`<api.rst#warcprox-meta-fields>`_), and can also be consulted by other
processes outside of warcprox, for reporting etc.

Statistics are grouped by "bucket". Every capture is counted as part of the
``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta``
request header. The fallback bucket in case none is specified is called
``__unspecified__``.

Within each bucket are three sub-buckets:

* ``new`` - tallies captures for which a complete record (usually a ``response``
  record) was written to warc
* ``revisit`` - tallies captures for which a ``revisit`` record was written to
  warc
* ``total`` - includes all urls processed, even those not written to warc (so the
  numbers may be greater than new + revisit)

Within each of these sub-buckets we keep two statistics:

* ``urls`` - simple count of urls
* ``wire_bytes`` - sum of bytes received over the wire, including http headers,
  from the remote server for each url

For historical reasons, in sqlite, the default store, statistics are kept as
json blobs::

    sqlite> select * from buckets_of_stats;
    bucket           stats
    ---------------  ---------------------------------------------------------------------------------------------
    __unspecified__  {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
    __all__          {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
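
Because the blobs are json, an outside process can read them with nothing more
than the sqlite3 and json modules. A minimal sketch, assuming the default
``--stats-db-file`` location::

    import json
    import sqlite3

    conn = sqlite3.connect('./warcprox.sqlite')
    for bucket, blob in conn.execute('select bucket, stats from buckets_of_stats'):
        stats = json.loads(blob)
        print(bucket, stats['total']['urls'], 'urls,',
              stats['total']['wire_bytes'], 'wire bytes')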

Plugins
=======
Warcprox supports a limited notion of plugins by way of the ``--plugin``
command line argument. Plugin classes are loaded from the regular python module
search path. They will be instantiated with one argument, a
``warcprox.Options``, which holds the values of all the command line arguments.
Legacy plugins with constructors that take no arguments are also supported.
Plugins should either have a method ``notify(self, recorded_url, records)`` or
should subclass ``warcprox.BasePostfetchProcessor``. More than one plugin can
be configured by specifying ``--plugin`` multiple times.

`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__
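
A sketch of what a ``notify``-style plugin might look like (the module path is
made up; adapt it to wherever the class actually lives)::

    # in a module importable as mypkg.mymod, registered with
    #   warcprox --plugin mypkg.mymod.MyPlugin
    class MyPlugin:
        def __init__(self, options):
            # options is the warcprox.Options holding the command line arguments
            self.options = options

        def notify(self, recorded_url, records):
            # called for each url fetched through the proxy, after the warc
            # records (if any) for that url have been written
            print('captured %s (%s warc records)' % (
                recorded_url.url, len(records or [])))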

License
=======

Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

setup.py (2 lines changed)

@@ -45,7 +45,7 @@ setuptools.setup(
        url='https://github.com/internetarchive/warcprox',
        author='Noah Levitt',
        author_email='nlevitt@archive.org',
        long_description=open('README.rst').read(),
        long_description=open('readme.rst').read(),
        license='GPL',
        packages=['warcprox'],
        install_requires=deps,

@@ -709,6 +709,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10)

    # next fetch hits the limit
    response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 420
    assert response.reason == "Reached limit"

@@ -717,6 +718,17 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
    assert response.headers["content-type"] == "text/plain;charset=utf-8"
    assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"

    # make sure limit doesn't get applied to a different stats bucket
    request_meta = {"stats":{"buckets":["no_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}}
    headers = {"Warcprox-Meta": json.dumps(request_meta)}
    response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 200
    assert response.headers['warcprox-test-header'] == 'i!'
    assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n'

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11)

def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
    urls_before = warcprox_.proxy.running_stats.urls

@@ -726,14 +738,16 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
    response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 200
    assert response.headers['Warcprox-Meta']
    data = json.loads(response.headers['Warcprox-Meta'])
    assert data['capture-metadata']
    response_meta = json.loads(response.headers['Warcprox-Meta'])
    assert response_meta['capture-metadata']
    try:
        dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'],
        dt = datetime.datetime.strptime(response_meta['capture-metadata']['timestamp'],
            '%Y-%m-%dT%H:%M:%SZ')
        assert dt
    except ValueError:
        pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp'])
        pytest.fail(
            'Invalid http response warcprox-meta["capture-metadata"]["timestamp"]: %r',
            meta['capture-metadata']['timestamp'])

    # wait for postfetch chain (or subsequent test could fail)
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

@@ -997,6 +1011,7 @@ def test_domain_doc_soft_limit(
        http_daemon, https_daemon, warcprox_, archiving_proxies):
    urls_before = warcprox_.proxy.running_stats.urls

    # ** comment is obsolete (server is multithreaded) but still useful **
    # we need to clear the connection pool here because
    # - connection pool already may already have an open connection localhost
    # - we're about to make a connection to foo.localhost

@@ -1132,6 +1147,23 @@ def test_domain_doc_soft_limit(
    assert response.headers["content-type"] == "text/plain;charset=utf-8"
    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"

    # make sure soft limit doesn't get applied to a different stats bucket
    request_meta = {
        "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":["foo.localhost"]}]},
        "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
    }
    headers = {"Warcprox-Meta": json.dumps(request_meta)}
    url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
    response = requests.get(
        url, proxies=archiving_proxies, headers=headers, stream=True,
        verify=False)
    assert response.status_code == 200
    assert response.headers['warcprox-test-header'] == 'o!'
    assert response.content == b'I am the warcprox test payload! pppppppppp!\n'

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22)

def test_domain_data_soft_limit(
        http_daemon, https_daemon, warcprox_, archiving_proxies):
    urls_before = warcprox_.proxy.running_stats.urls

@@ -1226,6 +1258,22 @@ def test_domain_data_soft_limit(
    ### assert response.headers["content-type"] == "text/plain;charset=utf-8"
    ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-2ka.localhost/new/wire_bytes=200\n"

    # make sure soft limit doesn't get applied to a different stats bucket
    request_meta = {
        "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]},
        "soft-limits": {"test_domain_data_limit_bucket:ÞZZ.localhost/new/wire_bytes":200},
    }
    headers = {"Warcprox-Meta": json.dumps(request_meta)}
    url = 'http://ÞZz.localhost:{}/y/z'.format(http_daemon.server_port)
    response = requests.get(
        url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 200
    assert response.headers['warcprox-test-header'] == 'y!'
    assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n'

    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)

# XXX this test relies on a tor proxy running at localhost:9050 with a working
# connection to the internet, and relies on a third party site (facebook) being
# up and behaving a certain way

@@ -193,7 +193,7 @@ def _build_arg_parser(prog='warcprox'):
            action='append', help=(
                'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
                'May be used multiple times to register multiple plugins. '
                'See README.rst for more information.'))
                'See readme.rst for more information.'))
    arg_parser.add_argument('--version', action='version',
            version="warcprox {}".format(warcprox.__version__))
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')

@@ -53,6 +53,53 @@ def _empty_bucket(bucket):
        },
    }

def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    Returns:
        list of strings

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

        Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}

    In this case the return value would be
    ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
    '''
    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if not 'bucket' in bucket:
                    self.logger.warn(
                            'ignoring invalid stats bucket in '
                            'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode('ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append(
                                    '%s:%s' % (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets

class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
    logger = logging.getLogger("warcprox.stats.StatsProcessor")

@@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
        return None

    def buckets(self, recorded_url):
        '''
        Unravels bucket definitions in Warcprox-Meta header. Each bucket
        definition can either be a string, which signifies the name of the
        bucket, or a dict. If a dict it is expected to have at least an item
        with key 'bucket' whose value is the name of the bucket. The other
        currently recognized item is 'tally-domains', which if supplied should
        be a list of domains. This instructs warcprox to additionally tally
        substats of the given bucket by domain. Host stats are stored in the
        stats table under the key '{parent-bucket}:{domain(normalized)}'.

        Example Warcprox-Meta header (a real one will likely have other
        sections besides 'stats'):

            Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}}
        '''
        buckets = ["__all__"]
        if (recorded_url.warcprox_meta
                and "stats" in recorded_url.warcprox_meta
                and "buckets" in recorded_url.warcprox_meta["stats"]):
            for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
                if isinstance(bucket, dict):
                    if not 'bucket' in bucket:
                        self.logger.warn(
                                'ignoring invalid stats bucket in '
                                'warcprox-meta header %s', bucket)
                        continue
                    buckets.append(bucket['bucket'])
                    if bucket.get('tally-domains'):
                        url = urlcanon.semantic(recorded_url.url)
                        for domain in bucket['tally-domains']:
                            domain = urlcanon.normalize_host(domain).decode('ascii')
                            if urlcanon.url_matches_domain(url, domain):
                                buckets.append(
                                        '%s:%s' % (bucket['bucket'], domain))
                else:
                    buckets.append(bucket)
        else:
            buckets.append("__unspecified__")

        return buckets
        return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta)

class RethinkStatsProcessor(StatsProcessor):
    logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor")

@@ -301,11 +309,9 @@ class RunningStats:
        need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10
        if need_minute_snap:
            self.minute_snaps.append((now, self.urls, self.warc_bytes))
            logging.debug('added minute snap %r', self.minute_snaps[-1])
        if need_ten_sec_snap:
            self.ten_sec_snaps.popleft()
            self.ten_sec_snaps.append((now, self.urls, self.warc_bytes))
            logging.trace('rotated in ten second snap %r', self.ten_sec_snaps[-1])

    def _closest_ten_sec_snap(self, t):
        # it's a deque so iterating over it is faster than indexed lookup

@@ -72,13 +72,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                block_rule = urlcanon.MatchRule(**rule)
                if block_rule.applies(url):
                    body = ("request rejected by warcprox: blocked by "
                            "rule found in Warcprox-Meta header: %s"
                            % rule).encode("utf-8")
                            "rule found in Warcprox-Meta header: %s\n"
                            % json.dumps(rule)).encode("utf-8")
                    self.send_response(403, "Forbidden")
                    self.send_header("Content-Type", "text/plain;charset=utf-8")
                    self.send_header("Connection", "close")
                    self.send_header("Content-Length", len(body))
                    response_meta = {"blocked-by-rule":rule}
                    response_meta = {"blocked-by-rule": rule}
                    self.send_header(
                        "Warcprox-Meta",
                        json.dumps(response_meta, separators=(",",":")))

@@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                self.client_address[0], self.command,
                self.url, rule))

    def _enforce_limit(self, limit_key, limit_value, soft=False):
    def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
        if not self.server.stats_db:
            return
        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
        _limit_key = limit_key

        # if limit_key looks like 'job1:foo.com/total/urls' then we only want
        # to apply this rule if the requested url is within domain
        bucket0_fields = bucket0.split(':')
        if len(bucket0_fields) == 2:
            domain = urlcanon.normalize_host(bucket0_fields[1])
            if not urlcanon.host_matches_domain(self.hostname, domain):
                return # else host matches, go ahead and enforce the limit
            bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
            _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
        # parse limit key
        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
        # normalize domain if part of bucket
        if ":" in bucket0:
            b, raw_domain = bucket0.split(":", 1)
            domain = urlcanon.normalize_host(raw_domain).decode("ascii")
            bucket0 = "%s:%s" % (b, domain)
            limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)

        if not bucket0 in buckets:
            return

        value = self.server.stats_db.value(bucket0, bucket1, bucket2)
        if value and limit_value and limit_value > 0 and value >= limit_value:
            body = ("request rejected by warcprox: reached %s %s=%s\n" % (
                "soft limit" if soft else "limit", _limit_key,
                "soft limit" if soft else "limit", limit_key,
                limit_value)).encode("utf-8")
            if soft:
                self.send_response(430, "Reached soft limit")

@@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                "stats": {bucket0:self.server.stats_db.value(bucket0)}
            }
            if soft:
                response_meta["reached-soft-limit"] = {_limit_key:limit_value}
                response_meta["reached-soft-limit"] = {limit_key:limit_value}
            else:
                response_meta["reached-limit"] = {_limit_key:limit_value}
                response_meta["reached-limit"] = {limit_key:limit_value}
            self.send_header(
                "Warcprox-Meta",
                json.dumps(response_meta, separators=(",",":")))
                "Warcprox-Meta", json.dumps(response_meta, separators=",:"))
            self.end_headers()
            if self.command != "HEAD":
                self.wfile.write(body)

@@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                self.client_address[0], 430 if soft else 420,
                self.command, self.url,
                "soft limit" if soft else "limit",
                _limit_key, limit_value))
                limit_key, limit_value))

    def _enforce_limits(self, warcprox_meta):
        """

@@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
        warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
        reached.
        """
        buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta)
        if warcprox_meta and "limits" in warcprox_meta:
            for item in warcprox_meta["limits"].items():
                limit_key, limit_value = item
                self._enforce_limit(limit_key, limit_value, soft=False)
                self._enforce_limit(buckets, limit_key, limit_value, soft=False)
        if warcprox_meta and "soft-limits" in warcprox_meta:
            for item in warcprox_meta["soft-limits"].items():
                limit_key, limit_value = item
                self._enforce_limit(limit_key, limit_value, soft=True)
                self._enforce_limit(buckets, limit_key, limit_value, soft=True)

    def _security_check(self, warcprox_meta):
        '''