Mirror of https://github.com/internetarchive/warcprox.git, synced 2025-01-18 13:22:09 +01:00

Commit a1356709df

README.rst (deleted, 186 lines)

@@ -1,186 +0,0 @@

warcprox - WARC writing MITM HTTP/S proxy
-----------------------------------------

.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
    :target: https://travis-ci.org/internetarchive/warcprox

Based on the excellent and simple pymiproxy by Nadeem Douba.
https://github.com/allfro/pymiproxy

Install
~~~~~~~

Warcprox runs on python 3.4+.

To install latest release run:

::

    # apt-get install libffi-dev libssl-dev
    pip install warcprox

You can also install the latest bleeding edge code:

::

    pip install git+https://github.com/internetarchive/warcprox.git

Trusting the CA cert
~~~~~~~~~~~~~~~~~~~~

For best results while browsing through warcprox, you need to add the CA
cert as a trusted cert in your browser. If you don't do that, you will
get the warning when you visit each new site. But worse, any embedded
https content on a different server will simply fail to load, because
the browser will reject the certificate without telling you.

Plugins
~~~~~~~

Warcprox supports a limited notion of plugins by way of the `--plugin` command
line argument. Plugin classes are loaded from the regular python module search
path. They will be instantiated with one argument, a `warcprox.Options`, which
holds the values of all the command line arguments. Legacy plugins with
constructors that take no arguments are also supported. Plugins should either
have a method `notify(self, recorded_url, records)` or should subclass
`warcprox.BasePostfetchProcessor`. More than one plugin can be configured by
specifying `--plugin` multiple times.

`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__

Usage
~~~~~

::

    usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
                    [--certs-dir CERTS_DIR] [-d DIRECTORY]
                    [--warc-filename WARC_FILENAME] [-z] [-n PREFIX]
                    [-s ROLLOVER_SIZE]
                    [--rollover-idle-time ROLLOVER_IDLE_TIME]
                    [-g DIGEST_ALGORITHM] [--base32]
                    [--method-filter HTTP_METHOD]
                    [--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL]
                    [-P PLAYBACK_PORT]
                    [-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP]
                    [--rethinkdb-services-url RETHINKDB_SERVICES_URL]
                    [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
                    [--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS]
                    [--version] [-v] [--trace] [-q]

    warcprox - WARC writing MITM HTTP/S proxy

    optional arguments:
      -h, --help            show this help message and exit
      -p PORT, --port PORT  port to listen on (default: 8000)
      -b ADDRESS, --address ADDRESS
                            address to listen on (default: localhost)
      -c CACERT, --cacert CACERT
                            CA certificate file; if file does not exist, it
                            will be created (default:
                            ./ayutla.monkeybrains.net-warcprox-ca.pem)
      --certs-dir CERTS_DIR
                            where to store and load generated certificates
                            (default: ./ayutla.monkeybrains.net-warcprox-ca)
      -d DIRECTORY, --dir DIRECTORY
                            where to write warcs (default: ./warcs)
      --warc-filename WARC_FILENAME
                            define custom WARC filename with variables
                            {prefix}, {timestamp14}, {timestamp17},
                            {serialno}, {randomtoken}, {hostname},
                            {shorthostname} (default:
                            {prefix}-{timestamp17}-{serialno}-{randomtoken})
      -z, --gzip            write gzip-compressed warc records
      -n PREFIX, --prefix PREFIX
                            default WARC filename prefix (default: WARCPROX)
      -s ROLLOVER_SIZE, --size ROLLOVER_SIZE
                            WARC file rollover size threshold in bytes
                            (default: 1000000000)
      --rollover-idle-time ROLLOVER_IDLE_TIME
                            WARC file rollover idle time threshold in seconds
                            (so that Friday's last open WARC doesn't sit there
                            all weekend waiting for more data) (default: None)
      -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
                            digest algorithm, one of sha384, sha224, md5,
                            sha256, sha512, sha1 (default: sha1)
      --base32              write digests in Base32 instead of hex
      --method-filter HTTP_METHOD
                            only record requests with the given http method(s)
                            (can be used more than once) (default: None)
      --stats-db-file STATS_DB_FILE
                            persistent statistics database file; empty string
                            or /dev/null disables statistics tracking
                            (default: ./warcprox.sqlite)
      --rethinkdb-stats-url RETHINKDB_STATS_URL
                            rethinkdb stats table url, e.g. rethinkdb://db0.fo
                            o.org,db1.foo.org:38015/my_warcprox_db/my_stats_ta
                            ble (default: None)
      -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
                            port to listen on for instant playback (default:
                            None)
      -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
                            persistent deduplication database file; empty
                            string or /dev/null disables deduplication
                            (default: ./warcprox.sqlite)
      --rethinkdb-dedup-url RETHINKDB_DEDUP_URL
                            rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,
                            db1.foo.org:38015/my_warcprox_db/my_dedup_table
                            (default: None)
      --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL
                            rethinkdb big table url (table will be populated
                            with various capture information and is suitable
                            for use as index for playback), e.g. rethinkdb://d
                            b0.foo.org,db1.foo.org:38015/my_warcprox_db/captur
                            es (default: None)
      --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL
                            🐷 url pointing to trough configuration rethinkdb
                            database, e.g. rethinkdb://db0.foo.org,db1.foo.org
                            :38015/trough_configuration (default: None)
      --cdxserver-dedup CDXSERVER_DEDUP
                            use a CDX Server URL for deduplication; e.g.
                            https://web.archive.org/cdx/search (default: None)
      --rethinkdb-services-url RETHINKDB_SERVICES_URL
                            rethinkdb service registry table url; if provided,
                            warcprox will create and heartbeat entry for
                            itself (default: None)
      --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
                            host:port of tor socks proxy, used only to connect
                            to .onion sites (default: None)
      --crawl-log-dir CRAWL_LOG_DIR
                            if specified, write crawl log files in the
                            specified directory; one crawl log is written per
                            warc filename prefix; crawl log format mimics
                            heritrix (default: None)
      --plugin PLUGIN_CLASS
                            Qualified name of plugin class, e.g.
                            "mypkg.mymod.MyClass". May be used multiple times
                            to register multiple plugins. See README.rst for
                            more information. (default: None)
      --version             show program's version number and exit
      -v, --verbose
      --trace
      -q, --quiet

License
~~~~~~~

Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

api.rst (new file, 320 lines)

@@ -0,0 +1,320 @@

warcprox API
************

Means of interacting with warcprox over http, aside from simply proxying urls.

.. contents::

``/status`` url
===============

If warcprox is running at localhost:8000, http://localhost:8000/status returns
a json blob with a bunch of status info. For example:

::

    $ curl -sS http://localhost:8000/status
    {
      "rates_5min": {
        "warc_bytes_per_sec": 0.0,
        "urls_per_sec": 0.0,
        "actual_elapsed": 277.2983281612396
      },
      "version": "2.4b2.dev174",
      "load": 0.0,
      "seconds_behind": 0.0,
      "threads": 100,
      "warc_bytes_written": 0,
      "port": 8000,
      "postfetch_chain": [
        {
          "queued_urls": 0,
          "processor": "SkipFacebookCaptchas"
        },
        {
          "queued_urls": 0,
          "processor": "BatchTroughLoader"
        },
        {
          "queued_urls": 0,
          "processor": "WarcWriterProcessor"
        },
        {
          "queued_urls": 0,
          "processor": "BatchTroughStorer"
        },
        {
          "queued_urls": 0,
          "processor": "RethinkStatsProcessor"
        },
        {
          "queued_urls": 0,
          "processor": "CrawlLogger"
        },
        {
          "queued_urls": 0,
          "processor": "TroughFeed"
        },
        {
          "queued_urls": 0,
          "processor": "RunningStats"
        }
      ],
      "queue_max_size": 500,
      "role": "warcprox",
      "queued_urls": 0,
      "active_requests": 1,
      "host": "wbgrp-svc405.us.archive.org",
      "rates_15min": {
        "warc_bytes_per_sec": 0.0,
        "urls_per_sec": 0.0,
        "actual_elapsed": 876.9885368347168
      },
      "unaccepted_requests": 0,
      "urls_processed": 0,
      "pid": 18841,
      "address": "127.0.0.1",
      "rates_1min": {
        "warc_bytes_per_sec": 0.0,
        "urls_per_sec": 0.0,
        "actual_elapsed": 54.92501664161682
      },
      "start_time": 1526690353.4060142
    }

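Because the endpoint is plain json over http, it is easy to consume from a
monitoring script. A minimal sketch with the python ``requests`` library
(field names are taken from the example above; adjust host and port to your
deployment)::

    import requests

    status = requests.get('http://localhost:8000/status').json()
    print('%s urls processed, %s queued' % (
        status['urls_processed'], status['queued_urls']))
    for processor in status['postfetch_chain']:
        # each entry reports its processor name and current queue depth
        print('%(processor)s: %(queued_urls)s queued' % processor)
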
``WARCPROX_WRITE_RECORD`` http method
=====================================

To make warcprox write an arbitrary warc record you can send it a special
request with http method ``WARCPROX_WRITE_RECORD``. The http request must
include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``.
Warcprox will use these to populate the warc record. For example::

    $ ncat --crlf 127.0.0.1 8000 <<EOF
    > WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1
    > WARC-Type: resource
    > Content-type: text/plain;charset=utf-8
    > Content-length: 29
    >
    > i am a warc record payload!
    > EOF
    HTTP/1.0 204 OK
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Tue, 22 May 2018 19:21:02 GMT

On success warcprox responds with http status 204. For the request above
warcprox will write a warc record that looks like this::

    WARC/1.0
    WARC-Type: resource
    WARC-Record-ID: <urn:uuid:d0e10852-b18c-4037-a99e-f41915fec5b5>
    WARC-Date: 2018-05-21T23:33:31Z
    WARC-Target-URI: special://url/some?thing
    WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
    WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df
    Content-Type: text/plain;charset=utf-8
    Content-Length: 29

    i am a warc record payload!

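The same request can be made from python without ncat. Here is a minimal
sketch using the standard library ``http.client``, which permits arbitrary
request methods; the url, headers and payload mirror the example above::

    import http.client

    payload = b'i am a warc record payload!\n'
    conn = http.client.HTTPConnection('127.0.0.1', 8000)
    conn.request(
        'WARCPROX_WRITE_RECORD', 'special://url/some?thing', body=payload,
        headers={
            'WARC-Type': 'resource',
            'Content-Type': 'text/plain;charset=utf-8',
            'Content-Length': str(len(payload)),
        })
    response = conn.getresponse()
    assert response.status == 204  # warcprox wrote the record
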
``Warcprox-Meta`` http request header
=====================================

``Warcprox-Meta`` is a special http request header that can be used to pass
configuration information and metadata with each proxy request to warcprox. The
value is a json blob. There are several fields understood by warcprox, and
arbitrary additional fields can be included. If warcprox doesn't recognize a
field it simply ignores it. Custom fields may be useful for custom warcprox
plugins (see `<readme.rst#plugins>`_).

Warcprox strips the ``warcprox-meta`` header out before sending the request to
the remote server, and does not write it in the warc request record.

Brozzler knows about ``warcprox-meta``. For information on configuring it in
brozzler, see
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta.
``Warcprox-Meta`` is often a very important part of brozzler job configuration.
It is the way url and data limits on jobs, seeds, and hosts are implemented,
among other things.

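Since the value is just json in an http header, any http client can set it. As
a rough sketch with the python ``requests`` library (the prefix name here is
made up for illustration), a client proxying through warcprox at
localhost:8000 might do::

    import json
    import requests

    warcprox_meta = {
        "warc-prefix": "my-crawl",
        "accept": ["capture-metadata"],
    }
    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    response = requests.get(
        'http://example.com/', proxies=proxies,
        headers={'Warcprox-Meta': json.dumps(warcprox_meta)})
    # if the request asked for it, warcprox echoes information back in a
    # Warcprox-Meta *response* header (see below)
    print(json.loads(response.headers.get('Warcprox-Meta', '{}')))
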
Warcprox-Meta fields
--------------------

``warc-prefix`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~
Specifies a warc filename prefix. Warcprox will write the warc record for this
capture, if any, to a warc named accordingly.

Example::

    Warcprox-Meta: {"warc-prefix": "special-warc"}

``dedup-bucket`` (string)
~~~~~~~~~~~~~~~~~~~~~~~~~
Specifies the deduplication bucket. For more information about deduplication
see `<readme.rst#deduplication>`_.

Example::

    Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"}

``blocks`` (list)
~~~~~~~~~~~~~~~~~
List of url match rules. Url match rules are somewhat described at
https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping
and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70.
(TODO: write a better doc and link to it)

Example::

    Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}

If any of the rules match the url being requested, warcprox aborts normal
processing and responds with a http ``403``. The http response includes
a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``,
which reproduces the value of the match rule that resulted in the block. The
presence of the ``warcprox-meta`` response header can be used by the client to
distinguish this type of response from a 403 from the remote site.

An example::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo
    HTTP/1.0 403 Forbidden
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 22:46:42 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 111
    Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}}

    request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"}

You might be wondering why ``blocks`` is necessary. Why would the warcprox
client make a request that it should already know will be blocked by the proxy?
The answer is that the request may be initiated somewhere where it's difficult
to evaluate the block rules. In particular, this circumstance prevails when the
browser controlled by brozzler is requesting images, javascript, css, and so
on, embedded in a page.

``stats`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~
``stats`` is a dictionary with only one field understood by warcprox,
``buckets``. The value of ``buckets`` is a list of strings and/or
dictionaries. A string signifies the name of the bucket; a dictionary is
expected to have at least an item with key ``bucket`` whose value is the name
of the bucket. The other currently recognized key is ``tally-domains``, which
if supplied should be a list of domains. This instructs warcprox to
additionally tally substats of the given bucket by domain.

See `<readme.rst#statistics>`_ for more information on statistics kept by
warcprox.

Examples::

    Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}}
    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

Domain stats are stored in the stats table under the key
``"bucket2:foo.bar.com"`` for the latter example. See the following two
sections for more examples. The ``soft-limits`` section has an example of a
limit on a domain specified in ``tally-domains``.

``limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~
Specifies quantitative limits for warcprox to enforce. The structure of the
dictionary is ``{stats_key: numerical_limit, ...}`` where the stats key has the
format ``"bucket/sub-bucket/statistic"``. See `readme.rst#statistics`_ for
further explanation of what "bucket", "sub-bucket", and "statistic" mean here.

If processing a request would result in exceeding a limit, warcprox aborts
normal processing and responds with a http ``420 Reached limit``. The http
response includes a ``Warcprox-Meta`` response header with the complete set
of statistics for the bucket whose limit has been reached.

Example::

    Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}

::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
    HTTP/1.0 420 Reached limit
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 23:08:32 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 77
    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}}

    request rejected by warcprox: reached limit test_limits_bucket/total/urls=10

``soft-limits`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
From warcprox's perspective ``soft-limits`` work almost exactly the same way
as ``limits``. The only difference is that when a soft limit is hit, warcprox
responds with an http ``430 Reached soft limit`` instead of http ``420``.

Warcprox clients might treat a ``430`` very differently from a ``420``. From
brozzler's perspective, for instance, ``soft-limits`` are very different from
``limits``. When brozzler receives a ``420`` from warcprox because a ``limit``
has been reached, this means that crawling for that seed is finished, and
brozzler sets about finalizing the crawl of that seed. On the other hand,
brozzler blissfully ignores ``430`` responses, because soft limits only apply
to a particular bucket (like a domain), and don't have any effect on crawling
of urls that don't fall in that bucket.

Example::

    Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}}

::

    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
    HTTP/1.0 430 Reached soft limit
    Server: BaseHTTP/0.6 Python/3.6.3
    Date: Fri, 25 May 2018 23:12:06 GMT
    Content-Type: text/plain;charset=utf-8
    Connection: close
    Content-Length: 82
    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}}

    request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10

``metadata`` (dictionary)
~~~~~~~~~~~~~~~~~~~~~~~~~
An arbitrary dictionary. Warcprox mostly ignores this. The one exception is
that if it has a ``seed`` entry and crawl logs are enabled via the
``--crawl-log-dir`` command line option, the value of ``seed`` is written to
the crawl log as the 11th field on the line, simulating heritrix's "source
tag".

Example::

    Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"}}

``accept`` (list)
~~~~~~~~~~~~~~~~~
Specifies fields that the client would like to receive in the ``Warcprox-Meta``
response header. Only one value is currently understood,
``capture-metadata``.

Example::

    Warcprox-Meta: {"accept": ["capture-metadata"]}

The response will include a ``Warcprox-Meta`` response header with one field
also called ``capture-metadata``. Currently warcprox reports one piece of
capture metadata, ``timestamp``, which represents the time fetch began for the
resource and matches the ``WARC-Date`` written to the warc record. For
example::

    Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}}

``Warcprox-Meta`` http response header
======================================
In some cases warcprox will add a ``Warcprox-Meta`` header to the http response
that it sends to the client. As with the request header, the value is a json
blob. It is only included if something in the ``warcprox-meta`` request header
calls for it. Those cases are described above in the `Warcprox-Meta http
request header`_ section.

readme.rst (new file, 173 lines)

@@ -0,0 +1,173 @@

Warcprox - WARC writing MITM HTTP/S proxy
*****************************************

.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master
    :target: https://travis-ci.org/internetarchive/warcprox

Warcprox is a tool for archiving the web. It is an http proxy that stores its
traffic to disk in `WARC
<https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/>`_
format. Warcprox captures encrypted https traffic by using the
`"man-in-the-middle" <https://en.wikipedia.org/wiki/Man-in-the-middle_attack>`_
technique (see the `Man-in-the-middle`_ section for more info).

The web pages that warcprox stores in WARC files can be played back using
software like `OpenWayback <https://github.com/iipc/openwayback>`_ or `pywb
<https://github.com/webrecorder/pywb>`_. Warcprox has been developed in
parallel with `brozzler <https://github.com/internetarchive/brozzler>`_ and
together they make a comprehensive modern distributed archival web crawling
system.

Warcprox was originally based on the excellent and simple pymiproxy by Nadeem
Douba. https://github.com/allfro/pymiproxy

.. contents::

Getting started
===============

Warcprox runs on python 3.4+.

To install the latest release run::

    # apt-get install libffi-dev libssl-dev
    pip install warcprox

You can also install the latest bleeding edge code::

    pip install git+https://github.com/internetarchive/warcprox.git

To start warcprox run::

    warcprox

Try ``warcprox --help`` for documentation on command line options.

Man-in-the-middle
=================

Normally, http proxies can't read https traffic, because it's encrypted. The
browser uses the http ``CONNECT`` method to establish a tunnel through the
proxy, and the proxy merely routes raw bytes between the client and server.
Since the bytes are encrypted, the proxy can't make sense of the information
it's proxying. This nonsensical encrypted data would not be very useful to
archive.

In order to capture https traffic, warcprox acts as a "man-in-the-middle"
(MITM). When it receives a ``CONNECT`` directive from a client, it generates a
public key certificate for the requested site, presents it to the client, and
proceeds to establish an encrypted connection with the client. Then it makes a
separate, normal https connection to the remote site. It decrypts, archives,
and re-encrypts traffic in both directions.

Although "man-in-the-middle" is often paired with "attack", there is nothing
malicious about what warcprox is doing. If you configure an instance of
warcprox as your browser's http proxy, you will see lots of certificate
warnings, since none of the certificates will be signed by trusted authorities.
To use warcprox effectively the client needs to disable certificate
verification, or add the CA cert generated by warcprox as a trusted authority.
(If you do this in your browser, make sure you undo it when you're done using
warcprox!)

API
===

For interacting with a running instance of warcprox.

* ``/status`` url
* ``WARCPROX_WRITE_RECORD`` http method
* ``Warcprox-Meta`` http request header and response header

See `<api.rst>`_.

Deduplication
=============

Warcprox avoids archiving redundant content by "deduplicating" it. The process
for deduplication works similarly to heritrix and other web archiving tools.

1. while fetching url, calculate payload content digest (typically sha1)
2. look up digest in deduplication database (warcprox supports a few different
   ones)
3. if found, write warc ``revisit`` record referencing the url and capture time
   of the previous capture
4. else (if not found),

   a. write warc ``response`` record with full payload
   b. store entry in deduplication database

The dedup database is partitioned into different "buckets". Urls are
deduplicated only against other captures in the same bucket. If specified, the
``dedup-bucket`` field of the ``Warcprox-Meta`` http request header determines
the bucket, otherwise the default bucket is used.

Deduplication can be disabled entirely by starting warcprox with the argument
``--dedup-db-file=/dev/null``.

Statistics
==========

Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb.
These are consulted for enforcing ``limits`` and ``soft-limits`` (see
`<api.rst#warcprox-meta-fields>`_), and can also be consulted by other
processes outside of warcprox, for reporting etc.

Statistics are grouped by "bucket". Every capture is counted as part of the
``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta``
request header. The fallback bucket in case none is specified is called
``__unspecified__``.

Within each bucket are three sub-buckets:

* ``new`` - tallies captures for which a complete record (usually a
  ``response`` record) was written to warc
* ``revisit`` - tallies captures for which a ``revisit`` record was written to
  warc
* ``total`` - includes all urls processed, even those not written to warc (so
  the numbers may be greater than new + revisit)

Within each of these sub-buckets we keep two statistics:

* ``urls`` - simple count of urls
* ``wire_bytes`` - sum of bytes received over the wire, including http
  headers, from the remote server for each url

For historical reasons, in sqlite, the default store, statistics are kept as
json blobs::

    sqlite> select * from buckets_of_stats;
    bucket           stats
    ---------------  ---------------------------------------------------------
    __unspecified__  {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
    __all__          {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}

Plugins
=======

Warcprox supports a limited notion of plugins by way of the ``--plugin``
command line argument. Plugin classes are loaded from the regular python module
search path. They will be instantiated with one argument, a
``warcprox.Options``, which holds the values of all the command line arguments.
Legacy plugins with constructors that take no arguments are also supported.
Plugins should either have a method ``notify(self, recorded_url, records)`` or
should subclass ``warcprox.BasePostfetchProcessor``. More than one plugin can
be configured by specifying ``--plugin`` multiple times.

`A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__

License
=======

Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

* Copyright (C) 2012 Cygnos Corporation
* Copyright (C) 2013-2018 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

setup.py (2 changed lines)

@@ -45,7 +45,7 @@ setuptools.setup(
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
     author_email='nlevitt@archive.org',
-    long_description=open('README.rst').read(),
+    long_description=open('readme.rst').read(),
     license='GPL',
     packages=['warcprox'],
     install_requires=deps,

@@ -709,6 +709,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10)

+    # next fetch hits the limit
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 420
     assert response.reason == "Reached limit"

@@ -717,6 +718,17 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
     assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"

+    # make sure limit doesn't get applied to a different stats bucket
+    request_meta = {"stats":{"buckets":["no_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}}
+    headers = {"Warcprox-Meta": json.dumps(request_meta)}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'i!'
+    assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11)
+
 def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

@@ -726,14 +738,16 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
     assert response.headers['Warcprox-Meta']
-    data = json.loads(response.headers['Warcprox-Meta'])
-    assert data['capture-metadata']
+    response_meta = json.loads(response.headers['Warcprox-Meta'])
+    assert response_meta['capture-metadata']
     try:
-        dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'],
+        dt = datetime.datetime.strptime(response_meta['capture-metadata']['timestamp'],
             '%Y-%m-%dT%H:%M:%SZ')
         assert dt
     except ValueError:
-        pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp'])
+        pytest.fail(
+            'Invalid http response warcprox-meta["capture-metadata"]["timestamp"]: %r',
+            meta['capture-metadata']['timestamp'])

     # wait for postfetch chain (or subsequent test could fail)
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

@@ -997,6 +1011,7 @@ def test_domain_doc_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

+    # ** comment is obsolete (server is multithreaded) but still useful **
     # we need to clear the connection pool here because
     # - connection pool already may already have an open connection localhost
     # - we're about to make a connection to foo.localhost

@@ -1132,6 +1147,23 @@ def test_domain_doc_soft_limit(
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
     assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"

+    # make sure soft limit doesn't get applied to a different stats bucket
+    request_meta = {
+        "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":["foo.localhost"]}]},
+        "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
+    }
+    headers = {"Warcprox-Meta": json.dumps(request_meta)}
+    url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
+    response = requests.get(
+        url, proxies=archiving_proxies, headers=headers, stream=True,
+        verify=False)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'o!'
+    assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22)
+
 def test_domain_data_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

@@ -1226,6 +1258,22 @@ def test_domain_data_soft_limit(
     ### assert response.headers["content-type"] == "text/plain;charset=utf-8"
     ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-2ka.localhost/new/wire_bytes=200\n"

+    # make sure soft limit doesn't get applied to a different stats bucket
+    request_meta = {
+        "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]},
+        "soft-limits": {"test_domain_data_limit_bucket:ÞZZ.localhost/new/wire_bytes":200},
+    }
+    headers = {"Warcprox-Meta": json.dumps(request_meta)}
+    url = 'http://ÞZz.localhost:{}/y/z'.format(http_daemon.server_port)
+    response = requests.get(
+        url, proxies=archiving_proxies, headers=headers, stream=True)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'y!'
+    assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n'
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
+
 # XXX this test relies on a tor proxy running at localhost:9050 with a working
 # connection to the internet, and relies on a third party site (facebook) being
 # up and behaving a certain way

@@ -193,7 +193,7 @@ def _build_arg_parser(prog='warcprox'):
         action='append', help=(
             'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". '
             'May be used multiple times to register multiple plugins. '
-            'See README.rst for more information.'))
+            'See readme.rst for more information.'))
     arg_parser.add_argument('--version', action='version',
         version="warcprox {}".format(warcprox.__version__))
     arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')

@@ -53,6 +53,53 @@ def _empty_bucket(bucket):
         },
     }

+def unravel_buckets(url, warcprox_meta):
+    '''
+    Unravels bucket definitions in Warcprox-Meta header. Each bucket
+    definition can either be a string, which signifies the name of the
+    bucket, or a dict. If a dict it is expected to have at least an item
+    with key 'bucket' whose value is the name of the bucket. The other
+    currently recognized item is 'tally-domains', which if supplied should
+    be a list of domains. This instructs warcprox to additionally tally
+    substats of the given bucket by domain. Host stats are stored in the
+    stats table under the key '{parent-bucket}:{domain(normalized)}'.
+
+    Returns:
+        list of strings
+
+    Example Warcprox-Meta header (a real one will likely have other
+    sections besides 'stats'):
+
+        Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
+
+    In this case the return value would be
+    ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
+    '''
+    buckets = ["__all__"]
+    if (warcprox_meta and "stats" in warcprox_meta
+            and "buckets" in warcprox_meta["stats"]):
+        for bucket in warcprox_meta["stats"]["buckets"]:
+            if isinstance(bucket, dict):
+                if not 'bucket' in bucket:
+                    self.logger.warn(
+                            'ignoring invalid stats bucket in '
+                            'warcprox-meta header %s', bucket)
+                    continue
+                buckets.append(bucket['bucket'])
+                if bucket.get('tally-domains'):
+                    canon_url = urlcanon.semantic(url)
+                    for domain in bucket['tally-domains']:
+                        domain = urlcanon.normalize_host(domain).decode('ascii')
+                        if urlcanon.url_matches_domain(canon_url, domain):
+                            buckets.append(
+                                    '%s:%s' % (bucket['bucket'], domain))
+            else:
+                buckets.append(bucket)
+    else:
+        buckets.append("__unspecified__")
+
+    return buckets
+
 class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
     logger = logging.getLogger("warcprox.stats.StatsProcessor")

@@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
         return None

     def buckets(self, recorded_url):
-        '''
-        Unravels bucket definitions in Warcprox-Meta header. Each bucket
-        definition can either be a string, which signifies the name of the
-        bucket, or a dict. If a dict it is expected to have at least an item
-        with key 'bucket' whose value is the name of the bucket. The other
-        currently recognized item is 'tally-domains', which if supplied should
-        be a list of domains. This instructs warcprox to additionally tally
-        substats of the given bucket by domain. Host stats are stored in the
-        stats table under the key '{parent-bucket}:{domain(normalized)}'.
-
-        Example Warcprox-Meta header (a real one will likely have other
-        sections besides 'stats'):
-
-            Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}}
-        '''
-        buckets = ["__all__"]
-        if (recorded_url.warcprox_meta
-                and "stats" in recorded_url.warcprox_meta
-                and "buckets" in recorded_url.warcprox_meta["stats"]):
-            for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
-                if isinstance(bucket, dict):
-                    if not 'bucket' in bucket:
-                        self.logger.warn(
-                                'ignoring invalid stats bucket in '
-                                'warcprox-meta header %s', bucket)
-                        continue
-                    buckets.append(bucket['bucket'])
-                    if bucket.get('tally-domains'):
-                        url = urlcanon.semantic(recorded_url.url)
-                        for domain in bucket['tally-domains']:
-                            domain = urlcanon.normalize_host(domain).decode('ascii')
-                            if urlcanon.url_matches_domain(url, domain):
-                                buckets.append(
-                                        '%s:%s' % (bucket['bucket'], domain))
-                else:
-                    buckets.append(bucket)
-        else:
-            buckets.append("__unspecified__")
-
-        return buckets
+        return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta)

 class RethinkStatsProcessor(StatsProcessor):
     logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor")

@@ -301,11 +309,9 @@ class RunningStats:
         need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10
         if need_minute_snap:
             self.minute_snaps.append((now, self.urls, self.warc_bytes))
-            logging.debug('added minute snap %r', self.minute_snaps[-1])
         if need_ten_sec_snap:
             self.ten_sec_snaps.popleft()
             self.ten_sec_snaps.append((now, self.urls, self.warc_bytes))
-            logging.trace('rotated in ten second snap %r', self.ten_sec_snaps[-1])

     def _closest_ten_sec_snap(self, t):
         # it's a deque so iterating over it is faster than indexed lookup

@@ -72,13 +72,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 block_rule = urlcanon.MatchRule(**rule)
                 if block_rule.applies(url):
                     body = ("request rejected by warcprox: blocked by "
-                            "rule found in Warcprox-Meta header: %s"
-                            % rule).encode("utf-8")
+                            "rule found in Warcprox-Meta header: %s\n"
+                            % json.dumps(rule)).encode("utf-8")
                     self.send_response(403, "Forbidden")
                     self.send_header("Content-Type", "text/plain;charset=utf-8")
                     self.send_header("Connection", "close")
                     self.send_header("Content-Length", len(body))
-                    response_meta = {"blocked-by-rule":rule}
+                    response_meta = {"blocked-by-rule": rule}
                     self.send_header(
                             "Warcprox-Meta",
                             json.dumps(response_meta, separators=(",",":")))

@@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                     self.client_address[0], self.command,
                     self.url, rule))

-    def _enforce_limit(self, limit_key, limit_value, soft=False):
+    def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
         if not self.server.stats_db:
             return
-        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
-        _limit_key = limit_key

-        # if limit_key looks like 'job1:foo.com/total/urls' then we only want
-        # to apply this rule if the requested url is within domain
-        bucket0_fields = bucket0.split(':')
-        if len(bucket0_fields) == 2:
-            domain = urlcanon.normalize_host(bucket0_fields[1])
-            if not urlcanon.host_matches_domain(self.hostname, domain):
-                return # else host matches, go ahead and enforce the limit
-            bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
-            _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
+        # parse limit key
+        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
+        # normalize domain if part of bucket
+        if ":" in bucket0:
+            b, raw_domain = bucket0.split(":", 1)
+            domain = urlcanon.normalize_host(raw_domain).decode("ascii")
+            bucket0 = "%s:%s" % (b, domain)
+            limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)
+
+        if not bucket0 in buckets:
+            return

         value = self.server.stats_db.value(bucket0, bucket1, bucket2)
         if value and limit_value and limit_value > 0 and value >= limit_value:
             body = ("request rejected by warcprox: reached %s %s=%s\n" % (
-                    "soft limit" if soft else "limit", _limit_key,
+                    "soft limit" if soft else "limit", limit_key,
                     limit_value)).encode("utf-8")
             if soft:
                 self.send_response(430, "Reached soft limit")

@@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 "stats": {bucket0:self.server.stats_db.value(bucket0)}
             }
             if soft:
-                response_meta["reached-soft-limit"] = {_limit_key:limit_value}
+                response_meta["reached-soft-limit"] = {limit_key:limit_value}
             else:
-                response_meta["reached-limit"] = {_limit_key:limit_value}
+                response_meta["reached-limit"] = {limit_key:limit_value}
             self.send_header(
-                    "Warcprox-Meta",
-                    json.dumps(response_meta, separators=(",",":")))
+                    "Warcprox-Meta", json.dumps(response_meta, separators=",:"))
             self.end_headers()
             if self.command != "HEAD":
                 self.wfile.write(body)

@@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                     self.client_address[0], 430 if soft else 420,
                     self.command, self.url,
                     "soft limit" if soft else "limit",
-                    _limit_key, limit_value))
+                    limit_key, limit_value))

     def _enforce_limits(self, warcprox_meta):
         """

@@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
        warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
        reached.
        """
+        buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta)
         if warcprox_meta and "limits" in warcprox_meta:
             for item in warcprox_meta["limits"].items():
                 limit_key, limit_value = item
-                self._enforce_limit(limit_key, limit_value, soft=False)
+                self._enforce_limit(buckets, limit_key, limit_value, soft=False)
         if warcprox_meta and "soft-limits" in warcprox_meta:
             for item in warcprox_meta["soft-limits"].items():
                 limit_key, limit_value = item
-                self._enforce_limit(limit_key, limit_value, soft=True)
+                self._enforce_limit(buckets, limit_key, limit_value, soft=True)

     def _security_check(self, warcprox_meta):
         '''