1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Support for custom data being added via 'PUT /<coll>/record' when… (#661)

* add support for custom data being added via 'PUT /<coll>/record' when in recording mode and 'enable_put_custom_record: true' set in 'recorder' config
- url specified via 'url' query arg and content type via request Content-Type
- update docs for put custom record options

* bump version to 2.6.0b4
This commit is contained in:
Ilya Kreymer 2021-07-18 17:04:34 -07:00 committed by GitHub
parent a0faf904ef
commit 98c6fba44d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 86 additions and 4 deletions

View File

@ -266,6 +266,7 @@ The full set of configurable options (with their default settings) is as follows
rollover_idle_secs: 600
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
source_filter: live
enable_put_custom_record: false
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
Most likely this will be the :ref:`live-web` collection, which should also be defined.
@ -341,6 +342,23 @@ When any dedup_policy, pywb can also access the dedup Redis index, along with an
This feature is still experimental but should generally work. Additional options for working with the Redis Dedup index will be added in the future.
.. _put-custom-record:
Adding Custom Resource Records
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
pywb now also supports adding custom data to a WARC ``resource`` record. This can be used to add custom resources, such as screenshots, logs, error messages,
etc., that are not normally captured as part of recording, but are still useful to store in WARCs.
To add a custom resource, simply call ``PUT /<coll>/record`` with the data to be added as the request body and the type of the data specified as the content-type. The ``url`` can be specified as a query param.
For example, adding a custom record ``file:///my-custom-resource`` containing ``Some Custom Data`` can be done using ``curl`` as follows::
curl -XPUT "localhost:8080/my-web-archive/record?url=file:///my-custom-resource" --data "Some Custom Data"
This feature is only available if ``enable_put_custom_record: true`` is set in the recorder config.
.. _auto-fetch:

View File

@ -2,10 +2,10 @@ from gevent.monkey import patch_all; patch_all()
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
from werkzeug.wsgi import pop_path_info
from six.moves.urllib.parse import urljoin
from six.moves.urllib.parse import urljoin, parse_qsl
from six import iteritems
from warcio.utils import to_native_str
from warcio.timeutils import iso_date_to_timestamp
from warcio.timeutils import iso_date_to_timestamp, timestamp_to_iso_date
from wsgiprox.wsgiprox import WSGIProxMiddleware
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
@ -74,6 +74,7 @@ class FrontEndApp(object):
custom_config=custom_config)
self.recorder = None
self.recorder_path = None
self.put_custom_record_path = None
self.proxy_default_timestamp = None
config = self.warcserver.config
@ -173,6 +174,10 @@ class FrontEndApp(object):
if self.recorder_path:
routes.append(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
# enable PUT of custom data as 'resource' records
if self.put_custom_record_path:
routes.append(Rule(coll_prefix + self.RECORD_ROUTE, endpoint=self.put_custom_record, methods=["PUT"]))
return routes
def get_upstream_paths(self, port):
@ -259,6 +264,10 @@ class FrontEndApp(object):
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
# enable PUT of custom data as 'resource' records
if recorder_config.get('enable_put_custom_record'):
self.put_custom_record_path = self.recorder_path + '&put_record={rec_type}&url={url}'
def init_autoindex(self, auto_interval):
"""Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op.
@ -476,6 +485,47 @@ class FrontEndApp(object):
return self.rewriterapp.render_content(wb_url_str, coll_config, environ)
def put_custom_record(self, environ, coll="$root"):
    """ When recording, PUT a custom WARC record to the specified collection
    (Available only when recording)

    The request body is forwarded to the recorder endpoint built from
    ``self.put_custom_record_path`` with record type ``resource``. The target
    URL is taken from the ``url`` query arg, the content type from the request
    ``CONTENT_TYPE`` (default ``text/plain``), and an optional ``timestamp``
    query arg is converted to an ISO date and sent as the ``WARC-Date`` header.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be added to
    :return: A JSON WbResponse with the recorder's JSON reply, or a
        ``400 Bad Request`` JSON error if no ``url`` query arg was given
    """
    # Imported locally so this method does not depend on the module-level
    # import list being changed.
    from six.moves.urllib.parse import quote

    # Read the full request body
    chunks = []
    while True:
        buff = environ["wsgi.input"].read()
        if not buff:
            break

        chunks.append(buff)

    data = b"".join(chunks)

    # Default to an empty query string so parse_qsl never receives None
    params = dict(parse_qsl(environ.get("QUERY_STRING", "")))

    rec_type = "resource"

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")

    if not target_uri:
        return WbResponse.json_response({"error": "no url"}, status="400 Bad Request")

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    # parse_qsl has already percent-decoded the target URL; re-encode it so
    # that characters such as '&', '#', '+' or spaces in the target URL are
    # not interpreted as part of the upstream recorder URL's own query string.
    put_url = self.put_custom_record_path.format(
        url=quote(target_uri, safe=""), coll=coll, rec_type=rec_type
    )
    res = requests.put(put_url, headers=headers, data=data)

    res = res.json()

    return WbResponse.json_response(res)
def setup_paths(self, environ, coll, record=False):
"""Populates the WSGI environment dictionary with the path information necessary to perform a response for
content or record.

View File

@ -1,4 +1,4 @@
__version__ = '2.6.0b3'
__version__ = '2.6.0b4'
if __name__ == '__main__':
print(__version__)

View File

@ -2,7 +2,9 @@ debug: true
collections_root: _test_colls
recorder: live
recorder:
source_coll: live
enable_put_custom_record: true
collections:
'live': '$live'

View File

@ -136,6 +136,18 @@ class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
assert to_path('collection="test2"') in link_lines[3]
#assert to_path('collection="test"') in link_lines[4]
def test_put_custom_record(self):
    """PUT an HTML payload to ``/test2/record`` as a custom record for
    ``https://example.com/custom/record``.
    """
    body = b'<html><body>This is custom data added directly. <a href="/test">Link</a></body></html>'
    self.testapp.put(
        '/test2/record?url=https://example.com/custom/record',
        params=body,
        content_type="text/html",
    )
def test_replay_custom_record(self, fmod):
    """Fetch the custom record URL from the ``test2`` collection and check
    that it comes back as ``text/html`` with its link prefixed by ``/test2/``.
    """
    self.ensure_empty()

    # Modifier path segment, only present when a modifier is given
    if fmod:
        prefix = fmod + '/'
    else:
        prefix = ''

    resp = self.get('/test2/{0}https://example.com/custom/record', prefix)
    assert resp.content_type == 'text/html'
    assert 'This is custom data added directly. <a href="/test2/' in resp.text
# ============================================================================
class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):