mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Support for custom data being added via 'PUT /<coll>/record' when… (#661)
* add support for custom data being added via 'PUT /<coll>/record' when in recording mode and 'enable_put_custom_record: true' set in 'recorder' config - url specified via 'url' query arg and content type via request Content-Type - update docs for put custom record options * bump version to 2.6.0b4
This commit is contained in:
parent
a0faf904ef
commit
98c6fba44d
@ -266,6 +266,7 @@ The full set of configurable options (with their default settings) is as follows
|
|||||||
rollover_idle_secs: 600
|
rollover_idle_secs: 600
|
||||||
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
||||||
source_filter: live
|
source_filter: live
|
||||||
|
enable_put_custom_record: false
|
||||||
|
|
||||||
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
||||||
Most likely this will be the :ref:`live-web` collection, which should also be defined.
|
Most likely this will be the :ref:`live-web` collection, which should also be defined.
|
||||||
@ -341,6 +342,23 @@ When any dedup_policy, pywb can also access the dedup Redis index, along with an
|
|||||||
This feature is still experimental but should generally work. Additional options for working with the Redis Dedup index will be added in the future.
|
This feature is still experimental but should generally work. Additional options for working with the Redis Dedup index will be added in the future.
|
||||||
|
|
||||||
|
|
||||||
|
.. _put-custom-record:
|
||||||
|
|
||||||
|
Adding Custom Resource Records
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
pywb now also supports adding custom data to a WARC ``resource`` record. This can be used to add custom resources, such as screenshots, logs, error messages,
|
||||||
|
etc., that are not normally captured as part of recording but are still useful to store in WARCs.
|
||||||
|
|
||||||
|
To add a custom resource, simply call ``PUT /<coll>/record`` with the data to be added as the request body and the type of the data specified as the ``Content-Type`` header. The ``url`` of the record must be specified as a query param.
|
||||||
|
|
||||||
|
For example, adding a custom record ``file:///my-custom-resource`` containing ``Some Custom Data`` can be done using ``curl`` as follows::
|
||||||
|
|
||||||
|
curl -XPUT "localhost:8080/my-web-archive/record?url=file:///my-custom-resource" --data "Some Custom Data"
|
||||||
|
|
||||||
|
|
||||||
|
This feature is only available if ``enable_put_custom_record: true`` is set in the recorder config.
|
||||||
|
|
||||||
|
|
||||||
.. _auto-fetch:
|
.. _auto-fetch:
|
||||||
|
|
||||||
|
@ -2,10 +2,10 @@ from gevent.monkey import patch_all; patch_all()
|
|||||||
|
|
||||||
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
||||||
from werkzeug.wsgi import pop_path_info
|
from werkzeug.wsgi import pop_path_info
|
||||||
from six.moves.urllib.parse import urljoin
|
from six.moves.urllib.parse import urljoin, parse_qsl
|
||||||
from six import iteritems
|
from six import iteritems
|
||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
from warcio.timeutils import iso_date_to_timestamp
|
from warcio.timeutils import iso_date_to_timestamp, timestamp_to_iso_date
|
||||||
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||||
|
|
||||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||||
@ -74,6 +74,7 @@ class FrontEndApp(object):
|
|||||||
custom_config=custom_config)
|
custom_config=custom_config)
|
||||||
self.recorder = None
|
self.recorder = None
|
||||||
self.recorder_path = None
|
self.recorder_path = None
|
||||||
|
self.put_custom_record_path = None
|
||||||
self.proxy_default_timestamp = None
|
self.proxy_default_timestamp = None
|
||||||
|
|
||||||
config = self.warcserver.config
|
config = self.warcserver.config
|
||||||
@ -173,6 +174,10 @@ class FrontEndApp(object):
|
|||||||
if self.recorder_path:
|
if self.recorder_path:
|
||||||
routes.append(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
|
routes.append(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
|
||||||
|
|
||||||
|
# enable PUT of custom data as 'resource' records
|
||||||
|
if self.put_custom_record_path:
|
||||||
|
routes.append(Rule(coll_prefix + self.RECORD_ROUTE, endpoint=self.put_custom_record, methods=["PUT"]))
|
||||||
|
|
||||||
return routes
|
return routes
|
||||||
|
|
||||||
def get_upstream_paths(self, port):
|
def get_upstream_paths(self, port):
|
||||||
@ -259,6 +264,10 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
|
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
|
||||||
|
|
||||||
|
# enable PUT of custom data as 'resource' records
|
||||||
|
if recorder_config.get('enable_put_custom_record'):
|
||||||
|
self.put_custom_record_path = self.recorder_path + '&put_record={rec_type}&url={url}'
|
||||||
|
|
||||||
def init_autoindex(self, auto_interval):
|
def init_autoindex(self, auto_interval):
|
||||||
"""Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op.
|
"""Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op.
|
||||||
|
|
||||||
@ -476,6 +485,47 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
return self.rewriterapp.render_content(wb_url_str, coll_config, environ)
|
return self.rewriterapp.render_content(wb_url_str, coll_config, environ)
|
||||||
|
|
||||||
|
def put_custom_record(self, environ, coll="$root"):
    """Write the request body as a custom WARC ``resource`` record.

    This endpoint is only routed when recording is enabled and
    ``enable_put_custom_record`` is set in the recorder config.

    The record's target URI is taken from the ``url`` query argument and
    its content type from the request ``Content-Type`` header
    (``text/plain`` if absent). An optional ``timestamp`` query argument
    is converted to an ISO date and sent as the ``WARC-Date`` header.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is added to
    :return: A JSON response relaying the recorder's JSON reply, or a
        ``400 Bad Request`` JSON error if no ``url`` query argument is given
    """
    # Local import: ``quote`` is needed only here, and ``six`` is already
    # a dependency of this module.
    from six.moves.urllib.parse import quote

    # Drain the request body; read() is repeated until it returns nothing.
    chunks = []
    while True:
        buff = environ["wsgi.input"].read()
        if not buff:
            break

        chunks.append(buff)

    data = b"".join(chunks)

    params = dict(parse_qsl(environ.get("QUERY_STRING")))

    rec_type = "resource"

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")

    if not target_uri:
        return WbResponse.json_response({"error": "no url"}, status="400 Bad Request")

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    # parse_qsl() has already percent-decoded the value, so it must be
    # re-encoded before being placed in the upstream query string;
    # otherwise a target URL containing '&', '#', '+' or '%xx' sequences
    # would be truncated or altered on its way to the recorder.
    put_url = self.put_custom_record_path.format(
        url=quote(target_uri, safe=""), coll=coll, rec_type=rec_type
    )
    res = requests.put(put_url, headers=headers, data=data)

    res = res.json()

    return WbResponse.json_response(res)
|
||||||
|
|
||||||
def setup_paths(self, environ, coll, record=False):
|
def setup_paths(self, environ, coll, record=False):
|
||||||
"""Populates the WSGI environment dictionary with the path information necessary to perform a response for
|
"""Populates the WSGI environment dictionary with the path information necessary to perform a response for
|
||||||
content or record.
|
content or record.
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
__version__ = '2.6.0b3'
|
__version__ = '2.6.0b4'
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(__version__)
|
print(__version__)
|
||||||
|
@ -2,7 +2,9 @@ debug: true
|
|||||||
|
|
||||||
collections_root: _test_colls
|
collections_root: _test_colls
|
||||||
|
|
||||||
recorder: live
|
recorder:
|
||||||
|
source_coll: live
|
||||||
|
enable_put_custom_record: true
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
'live': '$live'
|
'live': '$live'
|
||||||
|
@ -136,6 +136,18 @@ class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
|||||||
assert to_path('collection="test2"') in link_lines[3]
|
assert to_path('collection="test2"') in link_lines[3]
|
||||||
#assert to_path('collection="test"') in link_lines[4]
|
#assert to_path('collection="test"') in link_lines[4]
|
||||||
|
|
||||||
|
def test_put_custom_record(self):
    """PUT an HTML payload as a custom record for
    ``https://example.com/custom/record`` into the ``test2`` collection.
    """
    custom_html = b'<html><body>This is custom data added directly. <a href="/test">Link</a></body></html>'
    res = self.testapp.put(
        '/test2/record?url=https://example.com/custom/record',
        params=custom_html,
        content_type="text/html",
    )
|
||||||
|
|
||||||
|
def test_replay_custom_record(self, fmod):
    """Replay the custom record from ``test2`` and check that it is served
    as HTML with its link rewritten under the collection prefix.
    """
    self.ensure_empty()

    # Modifier path segment: '<fmod>/' when a modifier is given, else empty.
    if fmod:
        fmod_slash = fmod + '/'
    else:
        fmod_slash = ''

    res = self.get('/test2/{0}https://example.com/custom/record', fmod_slash)

    assert res.content_type == 'text/html'
    # The payload's href="/test" should now start with the collection prefix.
    assert 'This is custom data added directly. <a href="/test2/' in res.text
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user