From efc51a43617eb791e9829dd5ecff00e8bbf6a946 Mon Sep 17 00:00:00 2001 From: Noah Levitt <nlevitt@archive.org> Date: Tue, 22 May 2018 11:59:06 -0700 Subject: [PATCH] stubby api docs --- api.rst | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++ readme.rst | 23 ++++++++--- 2 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 api.rst diff --git a/api.rst b/api.rst new file mode 100644 index 0000000..87b444f --- /dev/null +++ b/api.rst @@ -0,0 +1,110 @@ +warcprox API +************ + +Means of interacting with warcprox over HTTP, aside from simply proxying URLs. + +`/status` url +============= + +If warcprox is running at localhost:8000, http://localhost:8000/status returns +a JSON blob with a bunch of status info. For example: + +:: + + $ curl -sS http://localhost:8000/status + { + "rates_5min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 277.2983281612396 + }, + "version": "2.4b2.dev174", + "load": 0.0, + "seconds_behind": 0.0, + "threads": 100, + "warc_bytes_written": 0, + "port": 8000, + "postfetch_chain": [ + { + "queued_urls": 0, + "processor": "SkipFacebookCaptchas" + }, + { + "queued_urls": 0, + "processor": "BatchTroughLoader" + }, + { + "queued_urls": 0, + "processor": "WarcWriterProcessor" + }, + { + "queued_urls": 0, + "processor": "BatchTroughStorer" + }, + { + "queued_urls": 0, + "processor": "RethinkStatsProcessor" + }, + { + "queued_urls": 0, + "processor": "CrawlLogger" + }, + { + "queued_urls": 0, + "processor": "TroughFeed" + }, + { + "queued_urls": 0, + "processor": "RunningStats" + } + ], + "queue_max_size": 500, + "role": "warcprox", + "queued_urls": 0, + "active_requests": 1, + "host": "wbgrp-svc405.us.archive.org", + "rates_15min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 876.9885368347168 + }, + "unaccepted_requests": 0, + "urls_processed": 0, + "pid": 18841, + "address": "127.0.0.1", + "rates_1min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + 
"actual_elapsed": 54.92501664161682 + }, + "start_time": 1526690353.4060142 + } + +`WARCPROX_WRITE_RECORD` http method +=================================== + +:: + + $ echo -ne 'WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1\r\nWARC-Type: resource\r\ncontent-type: text/plain;charset=utf-8\r\ncontent-length: 29\r\n\r\ni am a warc record payload!\r\n' | ncat 127.0.0.1 8000 + HTTP/1.0 204 OK + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Mon, 21 May 2018 23:33:31 GMT + +:: + + WARC/1.0 + WARC-Type: resource + WARC-Record-ID: <urn:uuid:d0e10852-b18c-4037-a99e-f41915fec5b5> + WARC-Date: 2018-05-21T23:33:31Z + WARC-Target-URI: special://url/some?thing + WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df + WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df + Content-Type: text/plain;charset=utf-8 + Content-Length: 29 + + i am a warc record payload! + + +`Warcprox-Meta` http request header +=================================== + diff --git a/readme.rst b/readme.rst index 113099b..090130e 100644 --- a/readme.rst +++ b/readme.rst @@ -1,5 +1,5 @@ warcprox - WARC writing MITM HTTP/S proxy ------------------------------------------ +***************************************** .. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox @@ -7,7 +7,7 @@ Based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy Install -~~~~~~~ +======= Warcprox runs on python 3.4+. @@ -26,7 +26,7 @@ You can also install the latest bleeding edge code: Trusting the CA cert -~~~~~~~~~~~~~~~~~~~~ +==================== For best results while browsing through warcprox, you need to add the CA cert as a trusted cert in your browser. If you don't do that, you will @@ -34,8 +34,19 @@ get the warning when you visit each new site. 
But worse, any embedded https content on a different server will simply fail to load, because the browser will reject the certificate without telling you. +API +=== + +Warcprox offers these ways of interacting with a running instance: + +* `/status` url +* `WARCPROX_WRITE_RECORD` http method +* `Warcprox-Meta` http request header + +See `<api.rst>`_. + Plugins -~~~~~~~ +======= Warcprox supports a limited notion of plugins by way of the `--plugin` command line argument. Plugin classes are loaded from the regular python module search @@ -49,7 +60,7 @@ specifying `--plugin` multiples times. `A minimal example <https://github.com/internetarchive/warcprox/blob/318405e795ac0ab8760988a1a482cf0a17697148/warcprox/__init__.py#L165>`__ Usage -~~~~~ +===== :: @@ -162,7 +173,7 @@ Usage -q, --quiet License -~~~~~~~ +======= Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also GPL.