Merge pull request #93 from nlevitt/docs

docs
Noah Levitt 2018-05-30 15:57:50 -07:00 committed by GitHub
commit a1356709df
8 changed files with 618 additions and 257 deletions

README.rst Deleted file

@ -1,186 +0,0 @@
warcprox - WARC writing MITM HTTP/S proxy
-----------------------------------------
.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
:target: https://travis-ci.org/internetarchive/warcprox
Based on the excellent and simple pymiproxy by Nadeem Douba.
https://github.com/allfro/pymiproxy
Install
~~~~~~~
Warcprox runs on python 3.4+.
To install the latest release run:
::
# apt-get install libffi-dev libssl-dev
pip install warcprox
You can also install the latest bleeding edge code:
::
pip install git+https://github.com/internetarchive/warcprox.git
Trusting the CA cert
~~~~~~~~~~~~~~~~~~~~
For best results while browsing through warcprox, you need to add the CA
cert as a trusted cert in your browser. If you don't do that, you will
get a warning when you visit each new site. But worse, any embedded
https content on a different server will simply fail to load, because
the browser will reject the certificate without telling you.
Plugins
~~~~~~~
Warcprox supports a limited notion of plugins by way of the `--plugin` command
line argument. Plugin classes are loaded from the regular python module search
path. They will be instantiated with one argument, a `warcprox.Options`, which
holds the values of all the command line arguments. Legacy plugins with
constructors that take no arguments are also supported. Plugins should either
have a method `notify(self, recorded_url, records)` or should subclass
`warcprox.BasePostfetchProcessor`. More than one plugin can be configured by
specifying `--plugin` multiple times.
`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__
Usage
~~~~~
::
usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
[--certs-dir CERTS_DIR] [-d DIRECTORY]
[--warc-filename WARC_FILENAME] [-z] [-n PREFIX]
[-s ROLLOVER_SIZE]
[--rollover-idle-time ROLLOVER_IDLE_TIME]
[-g DIGEST_ALGORITHM] [--base32]
[--method-filter HTTP_METHOD]
[--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL]
[-P PLAYBACK_PORT]
[-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP]
[--rethinkdb-services-url RETHINKDB_SERVICES_URL]
[--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
[--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS]
[--version] [-v] [--trace] [-q]
warcprox - WARC writing MITM HTTP/S proxy
optional arguments:
-h, --help show this help message and exit
-p PORT, --port PORT port to listen on (default: 8000)
-b ADDRESS, --address ADDRESS
address to listen on (default: localhost)
-c CACERT, --cacert CACERT
CA certificate file; if file does not exist, it
will be created (default:
./ayutla.monkeybrains.net-warcprox-ca.pem)
--certs-dir CERTS_DIR
where to store and load generated certificates
(default: ./ayutla.monkeybrains.net-warcprox-ca)
-d DIRECTORY, --dir DIRECTORY
where to write warcs (default: ./warcs)
--warc-filename WARC_FILENAME
define custom WARC filename with variables
{prefix}, {timestamp14}, {timestamp17},
{serialno}, {randomtoken}, {hostname},
{shorthostname} (default:
{prefix}-{timestamp17}-{serialno}-{randomtoken})
-z, --gzip write gzip-compressed warc records
-n PREFIX, --prefix PREFIX
default WARC filename prefix (default: WARCPROX)
-s ROLLOVER_SIZE, --size ROLLOVER_SIZE
WARC file rollover size threshold in bytes
(default: 1000000000)
--rollover-idle-time ROLLOVER_IDLE_TIME
WARC file rollover idle time threshold in seconds
(so that Friday's last open WARC doesn't sit there
all weekend waiting for more data) (default: None)
-g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
digest algorithm, one of sha384, sha224, md5,
sha256, sha512, sha1 (default: sha1)
--base32 write digests in Base32 instead of hex
--method-filter HTTP_METHOD
only record requests with the given http method(s)
(can be used more than once) (default: None)
--stats-db-file STATS_DB_FILE
persistent statistics database file; empty string
or /dev/null disables statistics tracking
(default: ./warcprox.sqlite)
--rethinkdb-stats-url RETHINKDB_STATS_URL
rethinkdb stats table url, e.g. rethinkdb://db0.fo
o.org,db1.foo.org:38015/my_warcprox_db/my_stats_ta
ble (default: None)
-P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
port to listen on for instant playback (default:
None)
-j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
persistent deduplication database file; empty
string or /dev/null disables deduplication
(default: ./warcprox.sqlite)
--rethinkdb-dedup-url RETHINKDB_DEDUP_URL
rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,
db1.foo.org:38015/my_warcprox_db/my_dedup_table
(default: None)
--rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL
rethinkdb big table url (table will be populated
with various capture information and is suitable
for use as index for playback), e.g. rethinkdb://d
b0.foo.org,db1.foo.org:38015/my_warcprox_db/captur
es (default: None)
--rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL
🐷 url pointing to trough configuration rethinkdb
database, e.g. rethinkdb://db0.foo.org,db1.foo.org
:38015/trough_configuration (default: None)
--cdxserver-dedup CDXSERVER_DEDUP
use a CDX Server URL for deduplication; e.g.
https://web.archive.org/cdx/search (default: None)
--rethinkdb-services-url RETHINKDB_SERVICES_URL
rethinkdb service registry table url; if provided,
warcprox will create and heartbeat an entry for
itself (default: None)
--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
host:port of tor socks proxy, used only to connect
to .onion sites (default: None)
--crawl-log-dir CRAWL_LOG_DIR
if specified, write crawl log files in the
specified directory; one crawl log is written per
warc filename prefix; crawl log format mimics
heritrix (default: None)
--plugin PLUGIN_CLASS
Qualified name of plugin class, e.g.
"mypkg.mymod.MyClass". May be used multiple times
to register multiple plugins. See README.rst for
more information. (default: None)
--version show program's version number and exit
-v, --verbose
--trace
-q, --quiet
License
~~~~~~~
Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.
* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

api.rst Normal file

@ -0,0 +1,320 @@
warcprox API
************
Means of interacting with warcprox over http, aside from simply proxying urls.
.. contents::
``/status`` url
===============
If warcprox is running at localhost:8000, http://localhost:8000/status returns
a json blob with a bunch of status info. For example:
::
$ curl -sS http://localhost:8000/status
{
"rates_5min": {
"warc_bytes_per_sec": 0.0,
"urls_per_sec": 0.0,
"actual_elapsed": 277.2983281612396
},
"version": "2.4b2.dev174",
"load": 0.0,
"seconds_behind": 0.0,
"threads": 100,
"warc_bytes_written": 0,
"port": 8000,
"postfetch_chain": [
{
"queued_urls": 0,
"processor": "SkipFacebookCaptchas"
},
{
"queued_urls": 0,
"processor": "BatchTroughLoader"
},
{
"queued_urls": 0,
"processor": "WarcWriterProcessor"
},
{
"queued_urls": 0,
"processor": "BatchTroughStorer"
},
{
"queued_urls": 0,
"processor": "RethinkStatsProcessor"
},
{
"queued_urls": 0,
"processor": "CrawlLogger"
},
{
"queued_urls": 0,
"processor": "TroughFeed"
},
{
"queued_urls": 0,
"processor": "RunningStats"
}
],
"queue_max_size": 500,
"role": "warcprox",
"queued_urls": 0,
"active_requests": 1,
"host": "wbgrp-svc405.us.archive.org",
"rates_15min": {
"warc_bytes_per_sec": 0.0,
"urls_per_sec": 0.0,
"actual_elapsed": 876.9885368347168
},
"unaccepted_requests": 0,
"urls_processed": 0,
"pid": 18841,
"address": "127.0.0.1",
"rates_1min": {
"warc_bytes_per_sec": 0.0,
"urls_per_sec": 0.0,
"actual_elapsed": 54.92501664161682
},
"start_time": 1526690353.4060142
}
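The status info is also easy to consume programmatically. Here is a minimal
sketch using only the python standard library (assuming warcprox is listening
at localhost:8000)::

    import json
    import urllib.request

    with urllib.request.urlopen('http://localhost:8000/status') as response:
        status = json.loads(response.read().decode('utf-8'))

    print('seconds behind: %s' % status['seconds_behind'])
    for step in status['postfetch_chain']:
        print('%(queued_urls)s urls queued at %(processor)s' % step)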
``WARCPROX_WRITE_RECORD`` http method
=====================================
To make warcprox write an arbitrary warc record you can send it a special
request with http method ``WARCPROX_WRITE_RECORD``. The http request must
include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``.
Warcprox will use these to populate the warc record. For example::
$ ncat --crlf 127.0.0.1 8000 <<EOF
> WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1
> WARC-Type: resource
> Content-type: text/plain;charset=utf-8
> Content-length: 29
>
> i am a warc record payload!
> EOF
HTTP/1.0 204 OK
Server: BaseHTTP/0.6 Python/3.6.3
Date: Tue, 22 May 2018 19:21:02 GMT
On success warcprox responds with http status 204. For the request above
warcprox will write a warc record that looks like this::
WARC/1.0
WARC-Type: resource
WARC-Record-ID: <urn:uuid:d0e10852-b18c-4037-a99e-f41915fec5b5>
WARC-Date: 2018-05-21T23:33:31Z
WARC-Target-URI: special://url/some?thing
WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
Content-Type: text/plain;charset=utf-8
Content-Length: 29
i am a warc record payload!
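The same record can be written from python. Here is a sketch using the
standard library ``http.client``, which permits the custom http method and
request target (assumes warcprox is listening at 127.0.0.1:8000;
``http.client`` computes ``Content-Length`` from the body)::

    import http.client

    conn = http.client.HTTPConnection('127.0.0.1', 8000)
    # the request target is the url the warc record will be stored under
    conn.request(
            'WARCPROX_WRITE_RECORD', 'special://url/some?thing',
            body=b'i am a warc record payload!\n',
            headers={
                'WARC-Type': 'resource',
                'Content-Type': 'text/plain;charset=utf-8',
            })
    response = conn.getresponse()
    assert response.status == 204  # success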
``Warcprox-Meta`` http request header
=====================================
``Warcprox-Meta`` is a special http request header that can be used to pass
configuration information and metadata with each proxy request to warcprox. The
value is a json blob. There are several fields understood by warcprox, and
arbitrary additional fields can be included. If warcprox doesn't recognize a
field it simply ignores it. Custom fields may be useful for custom warcprox
plugins (see `<readme.rst#plugins>`_).
Warcprox strips the ``warcprox-meta`` header out before sending the request to
the remote server, and does not write it in the warc request record.
Brozzler knows about ``warcprox-meta``. For information on configuring
it in brozzler, see
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta.
``Warcprox-Meta`` is often a very important part of brozzler job configuration.
It is the way url and data limits on jobs, seeds, and hosts are implemented,
among other things.
Warcprox-Meta fields
--------------------
``warc-prefix`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~
Specifies a warc filename prefix. Warcprox will write the warc record for this
capture, if any, to a warc named accordingly.
Example::
Warcprox-Meta: {"warc-prefix": "special-warc"}
``dedup-bucket`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~~
Specifies the deduplication bucket. For more information about deduplication
see `<readme.rst#deduplication>`_.
Example::
Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"}
``blocks`` (list)
~~~~~~~~~~~~~~~~~
List of url match rules. Url match rules are somewhat described at
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping
and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70.
(TODO: write a better doc and link to it)
Example::
Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}
If any of the rules match the url being requested, warcprox aborts normal
processing and responds with an http ``403``. The http response includes
a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``,
which reproduces the value of the match rule that resulted in the block. The
presence of the ``warcprox-meta`` response header lets the client
distinguish this type of response from a 403 sent by the remote site.
An example::
$ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo
HTTP/1.0 403 Forbidden
Server: BaseHTTP/0.6 Python/3.6.3
Date: Fri, 25 May 2018 22:46:42 GMT
Content-Type: text/plain;charset=utf-8
Connection: close
Content-Length: 111
Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}}
request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"}
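A client can automate that check. A sketch using the python ``requests``
library (assuming warcprox is listening at localhost:8000)::

    import json
    import requests

    proxies = {'http': 'http://localhost:8000',
               'https': 'http://localhost:8000'}
    request_meta = {'blocks': [{'ssurt': 'com,example,//http:/'}]}
    response = requests.get(
            'http://example.com/foo', proxies=proxies, verify=False,
            headers={'Warcprox-Meta': json.dumps(request_meta)})
    if response.status_code == 403 and 'Warcprox-Meta' in response.headers:
        # blocked by warcprox, not by the remote site
        response_meta = json.loads(response.headers['Warcprox-Meta'])
        print('blocked by rule: %s' % response_meta['blocked-by-rule'])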
You might be wondering why ``blocks`` is necessary. Why would the warcprox
client make a request that it should already know will be blocked by the proxy?
The answer is that the request may be initiated somewhere where it's difficult
to evaluate the block rules. In particular, this circumstance prevails when the
browser controlled by brozzler is requesting images, javascript, css, and so
on, embedded in a page.
``stats`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~
``stats`` is a dictionary with only one field understood by warcprox,
``buckets``. The value of ``buckets`` is a list of strings and/or
dictionaries. A string signifies the name of the bucket; a dictionary is
expected to have at least an item with key ``bucket`` whose value is the name
of the bucket. The other currently recognized key is ``tally-domains``, which
if supplied should be a list of domains. This instructs warcprox to
additionally tally substats of the given bucket by domain.
See `<readme.rst#statistics>`_ for more information on statistics kept by
warcprox.
Examples::
Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}}
Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
Domain stats are stored in the stats table under the key
``"bucket2:foo.bar.com"`` for the latter example. See the following two
sections for more examples. The ``soft-limits`` section has an example of a
limit on a domain specified in ``tally-domains``.
``limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~
Specifies quantitative limits for warcprox to enforce. The structure of the
dictionary is ``{stats_key: numerical_limit, ...}`` where the stats key has the
format ``"bucket/sub-bucket/statistic"``. See `<readme.rst#statistics>`_ for
further explanation of what "bucket", "sub-bucket", and "statistic" mean here.
If processing a request would result in exceeding a limit, warcprox aborts
normal processing and responds with an http ``420 Reached Limit``. The http
response includes a ``Warcprox-Meta`` response header with the complete set
of statistics for the bucket whose limit has been reached.
Example::
Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}
::
$ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
HTTP/1.0 420 Reached limit
Server: BaseHTTP/0.6 Python/3.6.3
Date: Fri, 25 May 2018 23:08:32 GMT
Content-Type: text/plain;charset=utf-8
Connection: close
Content-Length: 77
Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}}
request rejected by warcprox: reached limit test_limits_bucket/total/urls=10
``soft-limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
From warcprox's perspective ``soft-limits`` work almost exactly the same way
as ``limits``. The only difference is that when a soft limit is hit, warcprox
responds with an http ``430 Reached soft limit`` instead of http ``420``.
Warcprox clients might treat a ``430`` very differently from a ``420``. From
brozzler's perspective, for instance, ``soft-limits`` are very different from
``limits``. When brozzler receives a ``420`` from warcprox because a ``limit``
has been reached, this means that crawling for that seed is finished, and
brozzler sets about finalizing the crawl of that seed. On the other hand,
brozzler blissfully ignores ``430`` responses, because soft limits only apply
to a particular bucket (like a domain), and don't have any effect on crawling
of urls that don't fall in that bucket.
Example::
Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}}
::
$ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
HTTP/1.0 430 Reached soft limit
Server: BaseHTTP/0.6 Python/3.6.3
Date: Fri, 25 May 2018 23:12:06 GMT
Content-Type: text/plain;charset=utf-8
Connection: close
Content-Length: 82
Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}}
request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10
``metadata`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~
An arbitrary dictionary. Warcprox mostly ignores this. The one exception is
that if it has a ``seed`` entry and crawl logs are enabled via the
``--crawl-log-dir`` command line option, the value of ``seed`` is written to
the crawl log as the 11th field on the line, simulating heritrix's "source
tag".
Example::
Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"}
``accept`` (list)
~~~~~~~~~~~~~~~~~
Specifies fields that the client would like to receive in the ``Warcprox-Meta``
response header. Only one value is currently understood,
``capture-metadata``.
Example::
Warcprox-Meta: {"accept": ["capture-metadata"]}
The response will include a ``Warcprox-Meta`` response header with one field,
also called ``capture-metadata``. Currently warcprox reports one piece of
capture metadata, ``timestamp``, which represents the time fetch began for the
resource and matches the ``WARC-Date`` written to the warc record. For
example::
Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}}
``Warcprox-Meta`` http response header
======================================
In some cases warcprox will add a ``Warcprox-Meta`` header to the http response
that it sends to the client. As with the request header, the value is a json
blob. It is only included if something in the ``warcprox-meta`` request header
calls for it. Those cases are described above in the `Warcprox-Meta http
request header`_ section.

readme.rst Normal file

@ -0,0 +1,173 @@
Warcprox - WARC writing MITM HTTP/S proxy
*****************************************
.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
:target: https://travis-ci.org/internetarchive/warcprox
Warcprox is a tool for archiving the web. It is an http proxy that stores its
traffic to disk in `WARC
<https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/>`_
format. Warcprox captures encrypted https traffic by using the
`"man-in-the-middle" <https://en.wikipedia.org/wiki/Man-in-the-middle_attack>`_
technique (see the `Man-in-the-middle`_ section for more info).
The web pages that warcprox stores in WARC files can be played back using
software like `OpenWayback <https://github.com/iipc/openwayback>`_ or `pywb
<https://github.com/webrecorder/pywb>`_. Warcprox has been developed in
parallel with `brozzler <https://github.com/internetarchive/brozzler>`_ and
together they make a comprehensive modern distributed archival web crawling
system.
Warcprox was originally based on the excellent and simple pymiproxy by Nadeem
Douba. https://github.com/allfro/pymiproxy
.. contents::
Getting started
===============
Warcprox runs on python 3.4+.
To install the latest release run::
# apt-get install libffi-dev libssl-dev
pip install warcprox
You can also install the latest bleeding edge code::
pip install git+https://github.com/internetarchive/warcprox.git
To start warcprox run::
warcprox
Try ``warcprox --help`` for documentation on command line options.
Man-in-the-middle
=================
Normally, http proxies can't read https traffic, because it's encrypted. The
browser uses the http ``CONNECT`` method to establish a tunnel through the
proxy, and the proxy merely routes raw bytes between the client and server.
Since the bytes are encrypted, the proxy can't make sense of the information
it's proxying. This nonsensical encrypted data would not be very useful to
archive.
In order to capture https traffic, warcprox acts as a "man-in-the-middle"
(MITM). When it receives a ``CONNECT`` directive from a client, it generates a
public key certificate for the requested site, presents it to the client, and
proceeds to establish an encrypted connection with the client. Then it makes a
separate, normal https connection to the remote site. It decrypts, archives,
and re-encrypts traffic in both directions.
Although "man-in-the-middle" is often paired with "attack", there is nothing
malicious about what warcprox is doing. If you configure an instance of
warcprox as your browser's http proxy, you will see lots of certificate
warnings, since none of the certificates will be signed by trusted authorities.
To use warcprox effectively the client needs to disable certificate
verification, or add the CA cert generated by warcprox as a trusted authority.
(If you do this in your browser, make sure you undo it when you're done using
warcprox!)
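To try it out with curl, point ``--proxy`` at warcprox and pass the CA cert
that warcprox generates (the filename is derived from your hostname; check the
``--cacert`` default in ``warcprox --help``)::

    $ curl --proxy localhost:8000 --cacert ./$(hostname)-warcprox-ca.pem https://example.com/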
API
===
For interacting with a running instance of warcprox.
* ``/status`` url
* ``WARCPROX_WRITE_RECORD`` http method
* ``Warcprox-Meta`` http request header and response header
See `<api.rst>`_.
Deduplication
=============
Warcprox avoids archiving redundant content by "deduplicating" it. The process
for deduplication works similarly to heritrix and other web archiving tools
(a code sketch follows the steps below).
1. while fetching url, calculate payload content digest (typically sha1)
2. look up digest in deduplication database (warcprox supports a few different
ones)
3. if found, write warc ``revisit`` record referencing the url and capture time
of the previous capture
4. else (if not found),
a. write warc ``response`` record with full payload
b. store entry in deduplication database
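A schematic sketch of these steps in python (not warcprox's actual code;
``dedup_db`` and its ``lookup``/``save`` methods stand in for whichever
deduplication database is configured)::

    import hashlib

    def process_capture(url, payload, capture_time, dedup_db):
        # 1. calculate payload content digest
        digest = hashlib.sha1(payload).digest()
        # 2. look up digest in deduplication database
        entry = dedup_db.lookup(digest)
        if entry:
            # 3. found: write a warc `revisit` record referencing the url
            #    and capture time of the previous capture
            return ('revisit', entry['url'], entry['date'])
        else:
            # 4. not found: write a warc `response` record with the full
            #    payload, and remember this capture for next time
            dedup_db.save(digest, {'url': url, 'date': capture_time})
            return ('response', url, capture_time)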
The dedup database is partitioned into different "buckets". Urls are
deduplicated only against other captures in the same bucket. If specified, the
``dedup-bucket`` field of the ``Warcprox-Meta`` http request header determines
the bucket, otherwise the default bucket is used.
Deduplication can be disabled entirely by starting warcprox with the argument
``--dedup-db-file=/dev/null``.
Statistics
==========
Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb.
These are consulted for enforcing ``limits`` and ``soft-limits`` (see
`<api.rst#warcprox-meta-fields>`_), and can also be consulted by other
processes outside of warcprox, for reporting etc.
Statistics are grouped by "bucket". Every capture is counted as part of the
``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta``
request header. The fallback bucket in case none is specified is called
``__unspecified__``.
Within each bucket are three sub-buckets:
* ``new`` - tallies captures for which a complete record (usually a ``response``
record) was written to warc
* ``revisit`` - tallies captures for which a ``revisit`` record was written to
warc
* ``total`` - includes all urls processed, even those not written to warc (so the
numbers may be greater than new + revisit)
Within each of these sub-buckets we keep two statistics:
* ``urls`` - simple count of urls
* ``wire_bytes`` - sum of bytes received over the wire, including http headers,
from the remote server for each url
For historical reasons, statistics are kept as json blobs in sqlite, the
default store::
sqlite> select * from buckets_of_stats;
bucket stats
--------------- ---------------------------------------------------------------------------------------------
__unspecified__ {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
__all__ {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
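The blobs are easy to pull out with a few lines of python (a sketch, assuming
the default ``./warcprox.sqlite`` statistics database)::

    import json
    import sqlite3

    conn = sqlite3.connect('./warcprox.sqlite')
    for bucket, blob in conn.execute(
            'select bucket, stats from buckets_of_stats'):
        stats = json.loads(blob)
        print(bucket, stats['total']['urls'], stats['total']['wire_bytes'])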
Plugins
=======
Warcprox supports a limited notion of plugins by way of the ``--plugin``
command line argument. Plugin classes are loaded from the regular python module
search path. They will be instantiated with one argument, a
``warcprox.Options``, which holds the values of all the command line arguments.
Legacy plugins with constructors that take no arguments are also supported.
Plugins should either have a method ``notify(self, recorded_url, records)`` or
should subclass ``warcprox.BasePostfetchProcessor``. More than one plugin can
be configured by specifying ``--plugin`` multiple times.
`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__
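For illustration, a hypothetical notify-style plugin (``myplugins`` is a
made-up module name; any module on the python module search path works)::

    # myplugins.py
    class CaptureCounter:
        '''Counts urls as warcprox captures them.'''
        def __init__(self, options):
            # `options` is a warcprox.Options holding all command line arguments
            self.options = options
            self.count = 0

        def notify(self, recorded_url, records):
            # called for each url after it is fetched and its warc
            # records, if any, are written
            self.count += 1
            print('%s urls captured so far' % self.count)

Then run ``warcprox --plugin myplugins.CaptureCounter``.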
License
=======
Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.
* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

setup.py

@ -45,7 +45,7 @@ setuptools.setup(
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',
author_email='nlevitt@archive.org',
long_description=open('README.rst').read(),
long_description=open('readme.rst').read(),
license='GPL',
packages=['warcprox'],
install_requires=deps,

tests/test_warcprox.py

@ -709,6 +709,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10)
# next fetch hits the limit
response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 420
assert response.reason == "Reached limit"
@ -717,6 +718,17 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"
# make sure limit doesn't get applied to a different stats bucket
request_meta = {"stats":{"buckets":["no_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}}
headers = {"Warcprox-Meta": json.dumps(request_meta)}
response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'i!'
assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n'
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11)
def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
urls_before = warcprox_.proxy.running_stats.urls
@ -726,14 +738,16 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
assert response.headers['Warcprox-Meta']
data = json.loads(response.headers['Warcprox-Meta'])
assert data['capture-metadata']
response_meta = json.loads(response.headers['Warcprox-Meta'])
assert response_meta['capture-metadata']
try:
dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'],
dt = datetime.datetime.strptime(response_meta['capture-metadata']['timestamp'],
'%Y-%m-%dT%H:%M:%SZ')
assert dt
except ValueError:
pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp'])
pytest.fail(
'Invalid http response warcprox-meta["capture-metadata"]["timestamp"]: %r'
% response_meta['capture-metadata']['timestamp'])
# wait for postfetch chain (or subsequent test could fail)
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
@ -997,6 +1011,7 @@ def test_domain_doc_soft_limit(
http_daemon, https_daemon, warcprox_, archiving_proxies):
urls_before = warcprox_.proxy.running_stats.urls
# ** comment is obsolete (server is multithreaded) but still useful **
# we need to clear the connection pool here because
# - connection pool already may already have an open connection localhost
# - we're about to make a connection to foo.localhost
@ -1132,6 +1147,23 @@ def test_domain_doc_soft_limit(
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
# make sure soft limit doesn't get applied to a different stats bucket
request_meta = {
"stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":["foo.localhost"]}]},
"soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
}
headers = {"Warcprox-Meta": json.dumps(request_meta)}
url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'o!'
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22)
def test_domain_data_soft_limit(
http_daemon, https_daemon, warcprox_, archiving_proxies):
urls_before = warcprox_.proxy.running_stats.urls
@ -1226,6 +1258,22 @@ def test_domain_data_soft_limit(
### assert response.headers["content-type"] == "text/plain;charset=utf-8"
### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-2ka.localhost/new/wire_bytes=200\n"
# make sure soft limit doesn't get applied to a different stats bucket
request_meta = {
"stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]},
"soft-limits": {"test_domain_data_limit_bucket:ÞZZ.localhost/new/wire_bytes":200},
}
headers = {"Warcprox-Meta": json.dumps(request_meta)}
url = 'http://ÞZz.localhost:{}/y/z'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'y!'
assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n'
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
# XXX this test relies on a tor proxy running at localhost:9050 with a working
# connection to the internet, and relies on a third party site (facebook) being
# up and behaving a certain way

warcprox/main.py

@ -193,7 +193,7 @@ def _build_arg_parser(prog='warcprox'):
action='append', help=(
'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
'May be used multiple times to register multiple plugins. '
'See README.rst for more information.'))
'See readme.rst for more information.'))
arg_parser.add_argument('--version', action='version',
version="warcprox {}".format(warcprox.__version__))
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')

warcprox/stats.py

@ -53,6 +53,53 @@ def _empty_bucket(bucket):
},
}
def unravel_buckets(url, warcprox_meta):
'''
Unravels bucket definitions in Warcprox-Meta header. Each bucket
definition can either be a string, which signifies the name of the
bucket, or a dict. If a dict it is expected to have at least an item
with key 'bucket' whose value is the name of the bucket. The other
currently recognized item is 'tally-domains', which if supplied should
be a list of domains. This instructs warcprox to additionally tally
substats of the given bucket by domain. Host stats are stored in the
stats table under the key '{parent-bucket}:{domain(normalized)}'.
Returns:
list of strings
Example Warcprox-Meta header (a real one will likely have other
sections besides 'stats'):
Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
In this case the return value would be
["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
'''
buckets = ["__all__"]
if (warcprox_meta and "stats" in warcprox_meta
and "buckets" in warcprox_meta["stats"]):
for bucket in warcprox_meta["stats"]["buckets"]:
if isinstance(bucket, dict):
if 'bucket' not in bucket:
logging.warning(
'ignoring invalid stats bucket in '
'warcprox-meta header %s', bucket)
continue
buckets.append(bucket['bucket'])
if bucket.get('tally-domains'):
canon_url = urlcanon.semantic(url)
for domain in bucket['tally-domains']:
domain = urlcanon.normalize_host(domain).decode('ascii')
if urlcanon.url_matches_domain(canon_url, domain):
buckets.append(
'%s:%s' % (bucket['bucket'], domain))
else:
buckets.append(bucket)
else:
buckets.append("__unspecified__")
return buckets
class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
logger = logging.getLogger("warcprox.stats.StatsProcessor")
@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
return None
def buckets(self, recorded_url):
'''
Unravels bucket definitions in Warcprox-Meta header. Each bucket
definition can either be a string, which signifies the name of the
bucket, or a dict. If a dict it is expected to have at least an item
with key 'bucket' whose value is the name of the bucket. The other
currently recognized item is 'tally-domains', which if supplied should
be a list of domains. This instructs warcprox to additionally tally
substats of the given bucket by domain. Host stats are stored in the
stats table under the key '{parent-bucket}:{domain(normalized)}'.
Example Warcprox-Meta header (a real one will likely have other
sections besides 'stats'):
Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
'''
buckets = ["__all__"]
if (recorded_url.warcprox_meta
and "stats" in recorded_url.warcprox_meta
and "buckets" in recorded_url.warcprox_meta["stats"]):
for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
if isinstance(bucket, dict):
if not 'bucket' in bucket:
self.logger.warn(
'ignoring invalid stats bucket in '
'warcprox-meta header %s', bucket)
continue
buckets.append(bucket['bucket'])
if bucket.get('tally-domains'):
url = urlcanon.semantic(recorded_url.url)
for domain in bucket['tally-domains']:
domain = urlcanon.normalize_host(domain).decode('ascii')
if urlcanon.url_matches_domain(url, domain):
buckets.append(
'%s:%s' % (bucket['bucket'], domain))
else:
buckets.append(bucket)
else:
buckets.append("__unspecified__")
return buckets
return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta)
class RethinkStatsProcessor(StatsProcessor):
logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor")
@ -301,11 +309,9 @@ class RunningStats:
need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10
if need_minute_snap:
self.minute_snaps.append((now, self.urls, self.warc_bytes))
logging.debug('added minute snap %r', self.minute_snaps[-1])
if need_ten_sec_snap:
self.ten_sec_snaps.popleft()
self.ten_sec_snaps.append((now, self.urls, self.warc_bytes))
logging.trace('rotated in ten second snap %r', self.ten_sec_snaps[-1])
def _closest_ten_sec_snap(self, t):
# it's a deque so iterating over it is faster than indexed lookup

warcprox/warcproxy.py

@ -72,13 +72,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
block_rule = urlcanon.MatchRule(**rule)
if block_rule.applies(url):
body = ("request rejected by warcprox: blocked by "
"rule found in Warcprox-Meta header: %s"
% rule).encode("utf-8")
"rule found in Warcprox-Meta header: %s\n"
% json.dumps(rule)).encode("utf-8")
self.send_response(403, "Forbidden")
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Connection", "close")
self.send_header("Content-Length", len(body))
response_meta = {"blocked-by-rule":rule}
response_meta = {"blocked-by-rule": rule}
self.send_header(
"Warcprox-Meta",
json.dumps(response_meta, separators=(",",":")))
@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
self.client_address[0], self.command,
self.url, rule))
def _enforce_limit(self, limit_key, limit_value, soft=False):
def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
if not self.server.stats_db:
return
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
_limit_key = limit_key
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
# to apply this rule if the requested url is within domain
bucket0_fields = bucket0.split(':')
if len(bucket0_fields) == 2:
domain = urlcanon.normalize_host(bucket0_fields[1])
if not urlcanon.host_matches_domain(self.hostname, domain):
return # else host matches, go ahead and enforce the limit
bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
_limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
# parse limit key
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
# normalize domain if part of bucket
if ":" in bucket0:
b, raw_domain = bucket0.split(":", 1)
domain = urlcanon.normalize_host(raw_domain).decode("ascii")
bucket0 = "%s:%s" % (b, domain)
limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)
if not bucket0 in buckets:
return
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
if value and limit_value and limit_value > 0 and value >= limit_value:
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
"soft limit" if soft else "limit", _limit_key,
"soft limit" if soft else "limit", limit_key,
limit_value)).encode("utf-8")
if soft:
self.send_response(430, "Reached soft limit")
@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
"stats": {bucket0:self.server.stats_db.value(bucket0)}
}
if soft:
response_meta["reached-soft-limit"] = {_limit_key:limit_value}
response_meta["reached-soft-limit"] = {limit_key:limit_value}
else:
response_meta["reached-limit"] = {_limit_key:limit_value}
response_meta["reached-limit"] = {limit_key:limit_value}
self.send_header(
"Warcprox-Meta",
json.dumps(response_meta, separators=(",",":")))
"Warcprox-Meta", json.dumps(response_meta, separators=",:"))
self.end_headers()
if self.command != "HEAD":
self.wfile.write(body)
@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
self.client_address[0], 430 if soft else 420,
self.command, self.url,
"soft limit" if soft else "limit",
_limit_key, limit_value))
limit_key, limit_value))
def _enforce_limits(self, warcprox_meta):
"""
@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
reached.
"""
buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta)
if warcprox_meta and "limits" in warcprox_meta:
for item in warcprox_meta["limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=False)
self._enforce_limit(buckets, limit_key, limit_value, soft=False)
if warcprox_meta and "soft-limits" in warcprox_meta:
for item in warcprox_meta["soft-limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=True)
self._enforce_limit(buckets, limit_key, limit_value, soft=True)
def _security_check(self, warcprox_meta):
'''