diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index 763996c1..666a5346 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -266,6 +266,7 @@ The full set of configurable options (with their default settings) is as follows rollover_idle_secs: 600 filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz source_filter: live + enable_put_custom_record: false The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded. Most likely this will be the :ref:`live-web` collection, which should also be defined. @@ -341,6 +342,23 @@ When any dedup_policy, pywb can also access the dedup Redis index, along with an This feature is still experimental but should generally work. Additional options for working with the Redis Dedup index will be added in the futuer. +.. _put-custom-record: + +Adding Custom Resource Records +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pywb now also supports adding custom data to a WARC ``resource`` record. This can be used to add custom resources, such as screenshots, logs, error messages, +etc., that are not normally captured as part of recording but are still useful to store in WARCs. + +To add a custom resource, call ``PUT /<coll>/record`` with the data to be added as the request body and the type of the data specified as the content-type. The ``url`` of the new record must be specified as a query param (the request is rejected with ``400 Bad Request`` without it). An optional ``timestamp`` query param sets the ``WARC-Date`` of the record. + +For example, adding a custom record ``file:///my-custom-resource`` containing ``Some Custom Data`` can be done using ``curl`` as follows:: + + curl -XPUT "localhost:8080/my-web-archive/record?url=file:///my-custom-resource" --data "Some Custom Data" + + +This feature is only available if ``enable_put_custom_record: true`` is set in the recorder config. + .. 
_auto-fetch: diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 25d5e4ab..3ab74bfc 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -2,10 +2,10 @@ from gevent.monkey import patch_all; patch_all() from werkzeug.routing import Map, Rule, RequestRedirect, Submount from werkzeug.wsgi import pop_path_info -from six.moves.urllib.parse import urljoin +from six.moves.urllib.parse import urljoin, parse_qsl from six import iteritems from warcio.utils import to_native_str -from warcio.timeutils import iso_date_to_timestamp +from warcio.timeutils import iso_date_to_timestamp, timestamp_to_iso_date from wsgiprox.wsgiprox import WSGIProxMiddleware from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter @@ -74,6 +74,7 @@ class FrontEndApp(object): custom_config=custom_config) self.recorder = None self.recorder_path = None + self.put_custom_record_path = None self.proxy_default_timestamp = None config = self.warcserver.config @@ -173,6 +174,10 @@ class FrontEndApp(object): if self.recorder_path: routes.append(Rule(coll_prefix + self.RECORD_ROUTE + '/', endpoint=self.serve_record)) + # enable PUT of custom data as 'resource' records + if self.put_custom_record_path: + routes.append(Rule(coll_prefix + self.RECORD_ROUTE, endpoint=self.put_custom_record, methods=["PUT"])) + return routes def get_upstream_paths(self, port): @@ -259,6 +264,10 @@ class FrontEndApp(object): self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll) + # enable PUT of custom data as 'resource' records + if recorder_config.get('enable_put_custom_record'): + self.put_custom_record_path = self.recorder_path + '&put_record={rec_type}&url={url}' + def init_autoindex(self, auto_interval): """Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op. 
@@ -476,6 +485,62 @@ class FrontEndApp(object):
 
         return self.rewriterapp.render_content(wb_url_str, coll_config, environ)
 
+    def put_custom_record(self, environ, coll="$root"):
+        """Store the body of a PUT request as a custom WARC ``resource`` record.
+
+        The request body is forwarded, unchanged, to the recorder through
+        ``self.put_custom_record_path``; the recorder's JSON reply is relayed
+        back to the client. (Available only when recording)
+
+        Query parameters read from ``QUERY_STRING``:
+
+        * ``url`` (required): target URI of the new record
+        * ``timestamp`` (optional): passed through ``timestamp_to_iso_date``
+          and sent to the recorder as the ``WARC-Date`` header
+
+        :param dict environ: The WSGI environment dictionary for the request
+        :param str coll: The name of the collection the record is to be added to
+        :return: The recorder's JSON reply, or a ``400 Bad Request`` JSON error
+            when no ``url`` query parameter was supplied
+        :rtype: WbResponse
+        """
+        # Imported locally: the module-level import only provides urljoin/parse_qsl
+        from six.moves.urllib.parse import quote
+
+        # Drain the request body; read() is repeated until it yields nothing
+        chunks = []
+        while True:
+            buff = environ["wsgi.input"].read()
+            if not buff:
+                break
+
+            chunks.append(buff)
+
+        data = b"".join(chunks)
+
+        # QUERY_STRING may be absent from the WSGI environ; fall back to ''
+        params = dict(parse_qsl(environ.get("QUERY_STRING") or ""))
+
+        target_uri = params.get("url")
+        if not target_uri:
+            return WbResponse.json_response({"error": "no url"}, status="400 Bad Request")
+
+        headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}
+
+        timestamp = params.get("timestamp")
+        if timestamp:
+            headers["WARC-Date"] = timestamp_to_iso_date(timestamp)
+
+        # parse_qsl() percent-decoded the url, so re-encode it before placing it
+        # in the recorder query string; a raw '&' or '#' in the target URI would
+        # otherwise cut the ``url`` parameter short
+        put_url = self.put_custom_record_path.format(
+            url=quote(target_uri, safe=""), coll=coll, rec_type="resource"
+        )
+        res = requests.put(put_url, headers=headers, data=data)
+
+        return WbResponse.json_response(res.json())
+
     def setup_paths(self, environ, coll, record=False):
         """Populates the WSGI environment dictionary with the path information
         necessary to perform a response for content or record.
diff --git a/pywb/version.py b/pywb/version.py index 6d0f7edb..b84e19c4 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1,4 +1,4 @@ -__version__ = '2.6.0b3' +__version__ = '2.6.0b4' if __name__ == '__main__': print(__version__) diff --git a/tests/config_test_record.yaml b/tests/config_test_record.yaml index 9f4050a4..347a8887 100644 --- a/tests/config_test_record.yaml +++ b/tests/config_test_record.yaml @@ -2,7 +2,9 @@ debug: true collections_root: _test_colls -recorder: live +recorder: + source_coll: live + enable_put_custom_record: true collections: 'live': '$live' diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index 7b510dbc..c0f37392 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -136,6 +136,18 @@ class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest): assert to_path('collection="test2"') in link_lines[3] #assert to_path('collection="test"') in link_lines[4] + def test_put_custom_record(self): + payload = b'This is custom data added directly. Link' + res = self.testapp.put('/test2/record?url=https://example.com/custom/record', params=payload, content_type="text/html") + + def test_replay_custom_record(self, fmod): + self.ensure_empty() + + fmod_slash = fmod + '/' if fmod else '' + res = self.get('/test2/{0}https://example.com/custom/record', fmod_slash) + assert res.content_type == 'text/html' + assert 'This is custom data added directly.