diff --git a/README.rst b/README.rst deleted file mode 100644 index 113099b..0000000 --- a/README.rst +++ /dev/null @@ -1,186 +0,0 @@ -warcprox - WARC writing MITM HTTP/S proxy ------------------------------------------ -.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master - :target: https://travis-ci.org/internetarchive/warcprox - -Based on the excellent and simple pymiproxy by Nadeem Douba. -https://github.com/allfro/pymiproxy - -Install -~~~~~~~ - -Warcprox runs on python 3.4+. - -To install latest release run: - -:: - - # apt-get install libffi-dev libssl-dev - pip install warcprox - -You can also install the latest bleeding edge code: - -:: - - pip install git+https://github.com/internetarchive/warcprox.git - - -Trusting the CA cert -~~~~~~~~~~~~~~~~~~~~ - -For best results while browsing through warcprox, you need to add the CA -cert as a trusted cert in your browser. If you don't do that, you will -get the warning when you visit each new site. But worse, any embedded -https content on a different server will simply fail to load, because -the browser will reject the certificate without telling you. - -Plugins -~~~~~~~ - -Warcprox supports a limited notion of plugins by way of the `--plugin` command -line argument. Plugin classes are loaded from the regular python module search -path. They will be instantiated with one argument, a `warcprox.Options`, which -holds the values of all the command line arguments. Legacy plugins with -constructors that take no arguments are also supported. Plugins should either -have a method `notify(self, recorded_url, records)` or should subclass -`warcprox.BasePostfetchProcessor`. More than one plugin can be configured by -specifying `--plugin` multiples times. 
- -`A minimal example `__ - -Usage -~~~~~ - -:: - - usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT] - [--certs-dir CERTS_DIR] [-d DIRECTORY] - [--warc-filename WARC_FILENAME] [-z] [-n PREFIX] - [-s ROLLOVER_SIZE] - [--rollover-idle-time ROLLOVER_IDLE_TIME] - [-g DIGEST_ALGORITHM] [--base32] - [--method-filter HTTP_METHOD] - [--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL] - [-P PLAYBACK_PORT] - [-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP] - [--rethinkdb-services-url RETHINKDB_SERVICES_URL] - [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY] - [--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS] - [--version] [-v] [--trace] [-q] - - warcprox - WARC writing MITM HTTP/S proxy - - optional arguments: - -h, --help show this help message and exit - -p PORT, --port PORT port to listen on (default: 8000) - -b ADDRESS, --address ADDRESS - address to listen on (default: localhost) - -c CACERT, --cacert CACERT - CA certificate file; if file does not exist, it - will be created (default: - ./ayutla.monkeybrains.net-warcprox-ca.pem) - --certs-dir CERTS_DIR - where to store and load generated certificates - (default: ./ayutla.monkeybrains.net-warcprox-ca) - -d DIRECTORY, --dir DIRECTORY - where to write warcs (default: ./warcs) - --warc-filename WARC_FILENAME - define custom WARC filename with variables - {prefix}, {timestamp14}, {timestamp17}, - {serialno}, {randomtoken}, {hostname}, - {shorthostname} (default: - {prefix}-{timestamp17}-{serialno}-{randomtoken}) - -z, --gzip write gzip-compressed warc records - -n PREFIX, --prefix PREFIX - default WARC filename prefix (default: WARCPROX) - -s ROLLOVER_SIZE, --size ROLLOVER_SIZE - WARC file rollover size threshold in bytes - (default: 1000000000) - --rollover-idle-time ROLLOVER_IDLE_TIME - WARC file rollover idle time threshold in seconds 
- (so that Friday's last open WARC doesn't sit there - all weekend waiting for more data) (default: None) - -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM - digest algorithm, one of sha384, sha224, md5, - sha256, sha512, sha1 (default: sha1) - --base32 write digests in Base32 instead of hex - --method-filter HTTP_METHOD - only record requests with the given http method(s) - (can be used more than once) (default: None) - --stats-db-file STATS_DB_FILE - persistent statistics database file; empty string - or /dev/null disables statistics tracking - (default: ./warcprox.sqlite) - --rethinkdb-stats-url RETHINKDB_STATS_URL - rethinkdb stats table url, e.g. rethinkdb://db0.fo - o.org,db1.foo.org:38015/my_warcprox_db/my_stats_ta - ble (default: None) - -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT - port to listen on for instant playback (default: - None) - -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE - persistent deduplication database file; empty - string or /dev/null disables deduplication - (default: ./warcprox.sqlite) - --rethinkdb-dedup-url RETHINKDB_DEDUP_URL - rethinkdb dedup url, e.g. rethinkdb://db0.foo.org, - db1.foo.org:38015/my_warcprox_db/my_dedup_table - (default: None) - --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL - rethinkdb big table url (table will be populated - with various capture information and is suitable - for use as index for playback), e.g. rethinkdb://d - b0.foo.org,db1.foo.org:38015/my_warcprox_db/captur - es (default: None) - --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL - 🐷 url pointing to trough configuration rethinkdb - database, e.g. rethinkdb://db0.foo.org,db1.foo.org - :38015/trough_configuration (default: None) - --cdxserver-dedup CDXSERVER_DEDUP - use a CDX Server URL for deduplication; e.g. 
- https://web.archive.org/cdx/search (default: None) - --rethinkdb-services-url RETHINKDB_SERVICES_URL - rethinkdb service registry table url; if provided, - warcprox will create and heartbeat entry for - itself (default: None) - --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY - host:port of tor socks proxy, used only to connect - to .onion sites (default: None) - --crawl-log-dir CRAWL_LOG_DIR - if specified, write crawl log files in the - specified directory; one crawl log is written per - warc filename prefix; crawl log format mimics - heritrix (default: None) - --plugin PLUGIN_CLASS - Qualified name of plugin class, e.g. - "mypkg.mymod.MyClass". May be used multiple times - to register multiple plugins. See README.rst for - more information. (default: None) - --version show program's version number and exit - -v, --verbose - --trace - -q, --quiet - -License -~~~~~~~ - -Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also -GPL. - -* Copyright (C) 2012 Cygnos Corporation -* Copyright (C) 2013-2018 Internet Archive - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - diff --git a/api.rst b/api.rst new file mode 100644 index 0000000..bac642b --- /dev/null +++ b/api.rst @@ -0,0 +1,320 @@ +warcprox API +************ + +Means of interacting with warcprox over http, aside from simply proxying urls. + +.. 
contents:: + +``/status`` url +=============== + +If warcprox is running at localhost:8000, http://localhost:8000/status returns +a json blob with a bunch of status info. For example: + +:: + + $ curl -sS http://localhost:8000/status + { + "rates_5min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 277.2983281612396 + }, + "version": "2.4b2.dev174", + "load": 0.0, + "seconds_behind": 0.0, + "threads": 100, + "warc_bytes_written": 0, + "port": 8000, + "postfetch_chain": [ + { + "queued_urls": 0, + "processor": "SkipFacebookCaptchas" + }, + { + "queued_urls": 0, + "processor": "BatchTroughLoader" + }, + { + "queued_urls": 0, + "processor": "WarcWriterProcessor" + }, + { + "queued_urls": 0, + "processor": "BatchTroughStorer" + }, + { + "queued_urls": 0, + "processor": "RethinkStatsProcessor" + }, + { + "queued_urls": 0, + "processor": "CrawlLogger" + }, + { + "queued_urls": 0, + "processor": "TroughFeed" + }, + { + "queued_urls": 0, + "processor": "RunningStats" + } + ], + "queue_max_size": 500, + "role": "warcprox", + "queued_urls": 0, + "active_requests": 1, + "host": "wbgrp-svc405.us.archive.org", + "rates_15min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 876.9885368347168 + }, + "unaccepted_requests": 0, + "urls_processed": 0, + "pid": 18841, + "address": "127.0.0.1", + "rates_1min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 54.92501664161682 + }, + "start_time": 1526690353.4060142 + } + +``WARCPROX_WRITE_RECORD`` http method +===================================== + +To make warcprox write an arbitrary warc record you can send it a special +request with http method ``WARCPROX_WRITE_RECORD``. The http request must +include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``. +Warcprox will use these to populate the warc record. 
For example:: + + $ ncat --crlf 127.0.0.1 8000 < WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1 + > WARC-Type: resource + > Content-type: text/plain;charset=utf-8 + > Content-length: 29 + > + > i am a warc record payload! + > EOF + HTTP/1.0 204 OK + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Tue, 22 May 2018 19:21:02 GMT + +On success warcprox responds with http status 204. For the request above +warcprox will write a warc record that looks like this:: + + WARC/1.0 + WARC-Type: resource + WARC-Record-ID: + WARC-Date: 2018-05-21T23:33:31Z + WARC-Target-URI: special://url/some?thing + WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df + WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df + Content-Type: text/plain;charset=utf-8 + Content-Length: 29 + + i am a warc record payload! + +``Warcprox-Meta`` http request header +===================================== + +``Warcprox-Meta`` is a special http request header that can be used to pass +configuration information and metadata with each proxy request to warcprox. The +value is a json blob. There are several fields understood by warcprox, and +arbitrary additional fields can be included. If warcprox doesn't recognize a +field it simply ignores it. Custom fields may be useful for custom warcprox +plugins (see ``_). + +Warcprox strips the ``warcprox-meta`` header out before sending the request to +remote server, and does not write it in the warc request record. + +Brozzler knows about ``warcprox-meta``. For information on configuring +it in brozzler, see +https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta. +``Warcprox-Meta`` is often a very important part of brozzler job configuration. +It is the way url and data limits on jobs, seeds, and hosts are implemented, +among other things. + +Warcprox-Meta fields +-------------------- + +``warc-prefix`` (string) +~~~~~~~~~~~~~~~~~~~~~~~~ +Specifies a warc filename prefix. 
Warcprox will write the warc record for this +capture, if any, to a warc named accordingly. + +Example:: + + Warcprox-Meta: {"warc-prefix": "special-warc"} + +``dedup-bucket`` (string) +~~~~~~~~~~~~~~~~~~~~~~~~~ +Specifies the deduplication bucket. For more information about deduplication +see ``_. + +Example:: + + Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"} + +``blocks`` (list) +~~~~~~~~~~~~~~~~~ +List of url match rules. Url match rules are somewhat described at +https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping +and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70. +(TODO: write a better doc and link to it) + +Example:: + + Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} + +If any of the rules match the url being requested, warcprox aborts normal +processing and responds with a http ``403``. The http response includes +a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``, +which reproduces the value of the match rule that resulted in the block. The +presence of the ``warcprox-meta`` response header can be used by the client to +distinguish this type of a response from a 403 from the remote site. + +An example:: + + $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo + HTTP/1.0 403 Forbidden + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Fri, 25 May 2018 22:46:42 GMT + Content-Type: text/plain;charset=utf-8 + Connection: close + Content-Length: 111 + Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}} + + request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"} + +You might be wondering why ``blocks`` is necessary. 
Why would the warcprox +client make a request that it should already know will be blocked by the proxy? +The answer is that the request may be initiated somewhere where it's difficult +to evaluate the block rules. In particular, this circumstance prevails when the +browser controlled by brozzler is requesting images, javascript, css, and so +on, embedded in a page. + +``stats`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~ +``stats`` is a dictionary with only one field understood by warcprox, +``buckets``. The value of ``buckets`` is a list of strings and/or +dictionaries. A string signifies the name of the bucket; a dictionary is +expected to have at least an item with key ``bucket`` whose value is the name +of the bucket. The other currently recognized key is ``tally-domains``, which +if supplied should be a list of domains. This instructs warcprox to +additionally tally substats of the given bucket by domain. + +See ``_ for more information on statistics kept by +warcprox. + +Examples:: + + Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} + +Domain stats are stored in the stats table under the key +``"bucket2:foo.bar.com"`` for the latter example. See the following two +sections for more examples. The ``soft-limits`` section has an example of a +limit on a domain specified in ``tally-domains``. + +``limits`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~~ +Specifies quantitative limits for warcprox to enforce. The structure of the +dictionary is ``{stats_key: numerical_limit, ...}`` where stats key has the +format ``"bucket/sub-bucket/statistic"``. See `readme.rst#statistics`_ for +further explanation of what "bucket", "sub-bucket", and "statistic" mean here. + +If processing a request would result in exceeding a limit, warcprox aborts +normal processing and responds with a http ``420 Reached Limit``. 
The http +response includes a ``Warcprox-Meta`` response header with the complete set +of statistics for the bucket whose limit has been reached. + +Example:: + + Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} + +:: + + $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo + HTTP/1.0 420 Reached limit + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Fri, 25 May 2018 23:08:32 GMT + Content-Type: text/plain;charset=utf-8 + Connection: close + Content-Length: 77 + Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}} + + request rejected by warcprox: reached limit test_limits_bucket/total/urls=10 + +``soft-limits`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +From warcprox's perspective ``soft-limits`` work almost exactly the same way +as ``limits``. The only difference is that when a soft limit is hit, warcprox +response with an http ``430 Reached soft limit`` instead of http ``420``. + +Warcprox clients might treat a ``430`` very differently from a ``420``. From +brozzler's perspective, for instance, ``soft-limits`` are very different from +``limits``. When brozzler receives a ``420`` from warcprox because a ``limit`` +has been reached, this means that crawling for that seed is finished, and +brozzler sets about finalizing the crawl of that seed. On the other hand, +brozzler blissfully ignores ``430`` responses, because soft limits only apply +to a particular bucket (like a domain), and don't have any effect on crawling +of urls that don't fall in that bucket. 
+ +Example:: + + Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}} + +:: + + $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo + HTTP/1.0 430 Reached soft limit + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Fri, 25 May 2018 23:12:06 GMT + Content-Type: text/plain;charset=utf-8 + Connection: close + Content-Length: 82 + Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}} + + request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10 + +``metadata`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~~~~ +An arbitrary dictionary. Warcprox mostly ignores this. The one exception is +that if it has a ``seed`` entry and crawl logs are enabled via the +``--crawl-log-dir`` command line option, the value of ``seed`` is written to +the crawl log as the 11th field on the line, simulating heritrix's "source +tag". + +Example:: + + Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"}} + +``accept`` (list) +~~~~~~~~~~~~~~~~~ +Specifies fields that the client would like to receive in the ``Warcprox-Meta`` +response header. Only one value is currently understood, +``capture-metadata``. + +Example:: + + Warcprox-Meta: {"accept": ["capture-metadata"]} + +The response will include a ``Warcprox-Meta`` response header with one field +also called ``capture-metadata``. 
Currently warcprox reports one piece of +capture metadata, ``timestamp``, which represents the time fetch began for the +resource and matches the ``WARC-Date`` written to the warc record. For +example:: + + Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}} + +``Warcprox-Meta`` http response header +====================================== +In some cases warcprox will add a ``Warcprox-Meta`` header to the http response +that it sends to the client. As with the request header, the value is a json +blob. It is only included if something in the ``warcprox-meta`` request header +calls for it. Those cases are described above in the `Warcprox-Meta http +request header`_ section. + diff --git a/readme.rst b/readme.rst new file mode 100644 index 0000000..dbb1440 --- /dev/null +++ b/readme.rst @@ -0,0 +1,173 @@ +Warcprox - WARC writing MITM HTTP/S proxy +***************************************** +.. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master + :target: https://travis-ci.org/internetarchive/warcprox + +Warcprox is a tool for archiving the web. It is an http proxy that stores its +traffic to disk in `WARC +`_ +format. Warcprox captures encrypted https traffic by using the +`"man-in-the-middle" `_ +technique (see the `Man-in-the-middle`_ section for more info). + +The web pages that warcprox stores in WARC files can be played back using +software like `OpenWayback `_ or `pywb +`_. Warcprox has been developed in +parallel with `brozzler `_ and +together they make a comprehensive modern distributed archival web crawling +system. + +Warcprox was originally based on the excellent and simple pymiproxy by Nadeem +Douba. https://github.com/allfro/pymiproxy + +.. contents:: + +Getting started +=============== +Warcprox runs on python 3.4+. 
+ +To install latest release run:: + + # apt-get install libffi-dev libssl-dev + pip install warcprox + +You can also install the latest bleeding edge code:: + + pip install git+https://github.com/internetarchive/warcprox.git + +To start warcprox run:: + + warcprox + +Try ``warcprox --help`` for documentation on command line options. + +Man-in-the-middle +================= +Normally, http proxies can't read https traffic, because it's encrypted. The +browser uses the http ``CONNECT`` method to establish a tunnel through the +proxy, and the proxy merely routes raw bytes between the client and server. +Since the bytes are encrypted, the proxy can't make sense of the information +it's proxying. This nonsensical encrypted data would not be very useful to +archive. + +In order to capture https traffic, warcprox acts as a "man-in-the-middle" +(MITM). When it receives a ``CONNECT`` directive from a client, it generates a +public key certificate for the requested site, presents to the client, and +proceeds to establish an encrypted connection with the client. Then it makes a +separate, normal https connection to the remote site. It decrypts, archives, +and re-encrypts traffic in both directions. + +Although "man-in-the-middle" is often paired with "attack", there is nothing +malicious about what warcprox is doing. If you configure an instance of +warcprox as your browser's http proxy, you will see lots of certificate +warnings, since none of the certificates will be signed by trusted authorities. +To use warcprox effectively the client needs to disable certificate +verification, or add the CA cert generated by warcprox as a trusted authority. +(If you do this in your browser, make sure you undo it when you're done using +warcprox!) + +API +=== +For interacting with a running instance of warcprox. + +* ``/status`` url +* ``WARCPROX_WRITE_RECORD`` http method +* ``Warcprox-Meta`` http request header and response header + +See ``_. 
+ +Deduplication +============= +Warcprox avoids archiving redundant content by "deduplicating" it. The process +for deduplication works similarly to heritrix and other web archiving tools. + +1. while fetching url, calculate payload content digest (typically sha1) +2. look up digest in deduplication database (warcprox supports a few different + ones) +3. if found, write warc ``revisit`` record referencing the url and capture time + of the previous capture +4. else (if not found), + + a. write warc ``response`` record with full payload + b. store entry in deduplication database + +The dedup database is partitioned into different "buckets". Urls are +deduplicated only against other captures in the same bucket. If specified, the +``dedup-bucket`` field of the ``Warcprox-Meta`` http request header determines +the bucket, otherwise the default bucket is used. + +Deduplication can be disabled entirely by starting warcprox with the argument +``--dedup-db-file=/dev/null``. + +Statistics +========== +Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb. +These are consulted for enforcing ``limits`` and ``soft-limits`` (see +``_), and can also be consulted by other +processes outside of warcprox, for reporting etc. + +Statistics are grouped by "bucket". Every capture is counted as part of the +``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta`` +request header. The fallback bucket in case none is specified is called +``__unspecified__``. 
+ +Within each bucket are three sub-buckets: + +* ``new`` - tallies captures for which a complete record (usually a ``response`` + record) was written to warc +* ``revisit`` - tallies captures for which a ``revisit`` record was written to + warc +* ``total`` - includes all urls processed, even those not written to warc (so the + numbers may be greater than new + revisit) + +Within each of these sub-buckets we keep two statistics: + +* ``urls`` - simple count of urls +* ``wire_bytes`` - sum of bytes received over the wire, including http headers, + from the remote server for each url + +For historical reasons, in sqlite, the default store, statistics are kept as +json blobs:: + + sqlite> select * from buckets_of_stats; + bucket stats + --------------- --------------------------------------------------------------------------------------------- + __unspecified__ {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}} + __all__ {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}} + +Plugins +======= +Warcprox supports a limited notion of plugins by way of the ``--plugin`` +command line argument. Plugin classes are loaded from the regular python module +search path. They will be instantiated with one argument, a +``warcprox.Options``, which holds the values of all the command line arguments. +Legacy plugins with constructors that take no arguments are also supported. +Plugins should either have a method ``notify(self, recorded_url, records)`` or +should subclass ``warcprox.BasePostfetchProcessor``. More than one plugin can +be configured by specifying ``--plugin`` multiples times. + +`A minimal example `__ + +License +======= + +Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also +GPL. 
+ +* Copyright (C) 2012 Cygnos Corporation +* Copyright (C) 2013-2018 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + diff --git a/setup.py b/setup.py index 9e7db14..4fc1cbf 100755 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ setuptools.setup( url='https://github.com/internetarchive/warcprox', author='Noah Levitt', author_email='nlevitt@archive.org', - long_description=open('README.rst').read(), + long_description=open('readme.rst').read(), license='GPL', packages=['warcprox'], install_requires=deps, diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 079fdd1..13b6bad 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -709,6 +709,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): # wait for postfetch chain wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10) + # next fetch hits the limit response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 assert response.reason == "Reached limit" @@ -717,6 +718,17 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" + # make sure limit doesn't get applied to a 
different stats bucket + request_meta = {"stats":{"buckets":["no_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}} + headers = {"Warcprox-Meta": json.dumps(request_meta)} + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'i!' + assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n' + + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11) + def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -726,14 +738,16 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 assert response.headers['Warcprox-Meta'] - data = json.loads(response.headers['Warcprox-Meta']) - assert data['capture-metadata'] + response_meta = json.loads(response.headers['Warcprox-Meta']) + assert response_meta['capture-metadata'] try: - dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'], + dt = datetime.datetime.strptime(response_meta['capture-metadata']['timestamp'], '%Y-%m-%dT%H:%M:%SZ') assert dt except ValueError: - pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp']) + pytest.fail( + 'Invalid http response warcprox-meta["capture-metadata"]["timestamp"]: %r', + response_meta['capture-metadata']['timestamp']) # wait for postfetch chain (or subsequent test could fail) wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) @@ -997,6 +1011,7 @@ def test_domain_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls + # ** comment is obsolete (server is multithreaded) but still useful ** + # we need to clear the connection pool here because + # - connection pool already 
may already have an open connection localhost # - we're about to make a connection to foo.localhost @@ -1132,6 +1147,23 @@ def test_domain_doc_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" + # make sure soft limit doesn't get applied to a different stats bucket + request_meta = { + "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":["foo.localhost"]}]}, + "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22) + def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -1226,6 +1258,22 @@ def test_domain_data_soft_limit( ### assert response.headers["content-type"] == "text/plain;charset=utf-8" ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-2ka.localhost/new/wire_bytes=200\n" + # make sure soft limit doesn't get applied to a different stats bucket + request_meta = { + "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]}, + "soft-limits": {"test_domain_data_limit_bucket:ÞZZ.localhost/new/wire_bytes":200}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + url = 'http://ÞZz.localhost:{}/y/z'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'y!' + assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5) + # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way diff --git a/warcprox/main.py b/warcprox/main.py index 6fb46ef..5f45a13 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -193,7 +193,7 @@ def _build_arg_parser(prog='warcprox'): action='append', help=( 'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". ' 'May be used multiple times to register multiple plugins. 
' - 'See README.rst for more information.')) + 'See readme.rst for more information.')) arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') diff --git a/warcprox/stats.py b/warcprox/stats.py index db2493c..64ff2d7 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -53,6 +53,53 @@ def _empty_bucket(bucket): }, } +def unravel_buckets(url, warcprox_meta): + ''' + Unravels bucket definitions in Warcprox-Meta header. Each bucket + definition can either be a string, which signifies the name of the + bucket, or a dict. If a dict it is expected to have at least an item + with key 'bucket' whose value is the name of the bucket. The other + currently recognized item is 'tally-domains', which if supplied should + be a list of domains. This instructs warcprox to additionally tally + substats of the given bucket by domain. Host stats are stored in the + stats table under the key '{parent-bucket}:{domain(normalized)}'. 
+ + Returns: + list of strings + + Example Warcprox-Meta header (a real one will likely have other + sections besides 'stats'): + + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}} + + In this case the return value would be + ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"] + ''' + buckets = ["__all__"] + if (warcprox_meta and "stats" in warcprox_meta + and "buckets" in warcprox_meta["stats"]): + for bucket in warcprox_meta["stats"]["buckets"]: + if isinstance(bucket, dict): + if not 'bucket' in bucket: + logging.warning( + 'ignoring invalid stats bucket in ' + 'warcprox-meta header %s', bucket) + continue + buckets.append(bucket['bucket']) + if bucket.get('tally-domains'): + canon_url = urlcanon.semantic(url) + for domain in bucket['tally-domains']: + domain = urlcanon.normalize_host(domain).decode('ascii') + if urlcanon.url_matches_domain(canon_url, domain): + buckets.append( + '%s:%s' % (bucket['bucket'], domain)) + else: + buckets.append(bucket) + else: + buckets.append("__unspecified__") + + return buckets + class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): logger = logging.getLogger("warcprox.stats.StatsProcessor") @@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): return None def buckets(self, recorded_url): - ''' - Unravels bucket definitions in Warcprox-Meta header. Each bucket - definition can either be a string, which signifies the name of the - bucket, or a dict. If a dict it is expected to have at least an item - with key 'bucket' whose value is the name of the bucket. The other - currently recognized item is 'tally-domains', which if supplied should - be a list of domains. This instructs warcprox to additionally tally - substats of the given bucket by domain. Host stats are stored in the - stats table under the key '{parent-bucket}:{domain(normalized)}'. 
- - Example Warcprox-Meta header (a real one will likely have other - sections besides 'stats'): - - Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}} - ''' - buckets = ["__all__"] - if (recorded_url.warcprox_meta - and "stats" in recorded_url.warcprox_meta - and "buckets" in recorded_url.warcprox_meta["stats"]): - for bucket in recorded_url.warcprox_meta["stats"]["buckets"]: - if isinstance(bucket, dict): - if not 'bucket' in bucket: - self.logger.warn( - 'ignoring invalid stats bucket in ' - 'warcprox-meta header %s', bucket) - continue - buckets.append(bucket['bucket']) - if bucket.get('tally-domains'): - url = urlcanon.semantic(recorded_url.url) - for domain in bucket['tally-domains']: - domain = urlcanon.normalize_host(domain).decode('ascii') - if urlcanon.url_matches_domain(url, domain): - buckets.append( - '%s:%s' % (bucket['bucket'], domain)) - else: - buckets.append(bucket) - else: - buckets.append("__unspecified__") - - return buckets + return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta) class RethinkStatsProcessor(StatsProcessor): logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor") @@ -301,11 +309,9 @@ class RunningStats: need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10 if need_minute_snap: self.minute_snaps.append((now, self.urls, self.warc_bytes)) - logging.debug('added minute snap %r', self.minute_snaps[-1]) if need_ten_sec_snap: self.ten_sec_snaps.popleft() self.ten_sec_snaps.append((now, self.urls, self.warc_bytes)) - logging.trace('rotated in ten second snap %r', self.ten_sec_snaps[-1]) def _closest_ten_sec_snap(self, t): # it's a deque so iterating over it is faster than indexed lookup diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 0d93e5c..417f450 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -72,13 +72,13 @@ class 
WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): block_rule = urlcanon.MatchRule(**rule) if block_rule.applies(url): body = ("request rejected by warcprox: blocked by " - "rule found in Warcprox-Meta header: %s" - % rule).encode("utf-8") + "rule found in Warcprox-Meta header: %s\n" + % json.dumps(rule)).encode("utf-8") self.send_response(403, "Forbidden") self.send_header("Content-Type", "text/plain;charset=utf-8") self.send_header("Connection", "close") self.send_header("Content-Length", len(body)) - response_meta = {"blocked-by-rule":rule} + response_meta = {"blocked-by-rule": rule} self.send_header( "Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) @@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], self.command, self.url, rule)) - def _enforce_limit(self, limit_key, limit_value, soft=False): + def _enforce_limit(self, buckets, limit_key, limit_value, soft=False): if not self.server.stats_db: return - bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) - _limit_key = limit_key - # if limit_key looks like 'job1:foo.com/total/urls' then we only want - # to apply this rule if the requested url is within domain - bucket0_fields = bucket0.split(':') - if len(bucket0_fields) == 2: - domain = urlcanon.normalize_host(bucket0_fields[1]) - if not urlcanon.host_matches_domain(self.hostname, domain): - return # else host matches, go ahead and enforce the limit - bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii')) - _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) + # parse limit key + bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + # normalize domain if part of bucket + if ":" in bucket0: + b, raw_domain = bucket0.split(":", 1) + domain = urlcanon.normalize_host(raw_domain).decode("ascii") + bucket0 = "%s:%s" % (b, domain) + limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2) + + if not bucket0 in buckets: + return value = self.server.stats_db.value(bucket0, bucket1, 
bucket2) if value and limit_value and limit_value > 0 and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % ( - "soft limit" if soft else "limit", _limit_key, + "soft limit" if soft else "limit", limit_key, limit_value)).encode("utf-8") if soft: self.send_response(430, "Reached soft limit") @@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): "stats": {bucket0:self.server.stats_db.value(bucket0)} } if soft: - response_meta["reached-soft-limit"] = {_limit_key:limit_value} + response_meta["reached-soft-limit"] = {limit_key:limit_value} else: - response_meta["reached-limit"] = {_limit_key:limit_value} + response_meta["reached-limit"] = {limit_key:limit_value} self.send_header( - "Warcprox-Meta", - json.dumps(response_meta, separators=(",",":"))) + "Warcprox-Meta", json.dumps(response_meta, separators=",:")) self.end_headers() if self.command != "HEAD": self.wfile.write(body) @@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], 430 if soft else 420, self.command, self.url, "soft limit" if soft else "limit", - _limit_key, limit_value)) + limit_key, limit_value)) def _enforce_limits(self, warcprox_meta): """ @@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is reached. 
""" + buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta) if warcprox_meta and "limits" in warcprox_meta: for item in warcprox_meta["limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=False) + self._enforce_limit(buckets, limit_key, limit_value, soft=False) if warcprox_meta and "soft-limits" in warcprox_meta: for item in warcprox_meta["soft-limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=True) + self._enforce_limit(buckets, limit_key, limit_value, soft=True) def _security_check(self, warcprox_meta): '''