mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Support for custom data being added via 'PUT /<coll>/record' when… (#661)
* add support for custom data being added via 'PUT /<coll>/record' when in recording mode and 'enable_put_custom_record: true' set in 'recorder' config - url specified via 'url' query arg and content type via request Content-Type - update docs for put custom record options * bump version to 2.6.0b4
This commit is contained in:
parent
a0faf904ef
commit
98c6fba44d
@ -266,6 +266,7 @@ The full set of configurable options (with their default settings) is as follows
|
||||
rollover_idle_secs: 600
|
||||
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
||||
source_filter: live
|
||||
enable_put_custom_record: false
|
||||
|
||||
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
||||
Most likely this will be the :ref:`live-web` collection, which should also be defined.
|
||||
@ -341,6 +342,23 @@ When any dedup_policy, pywb can also access the dedup Redis index, along with an
|
||||
This feature is still experimental but should generally work. Additional options for working with the Redis Dedup index will be added in the future.
|
||||
|
||||
|
||||
.. _put-custom-record:
|
||||
|
||||
Adding Custom Resource Records
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
pywb now also supports adding custom data to a WARC ``resource`` record. This can be used to add custom resources, such as screenshots, logs, error messages,
|
||||
etc., that are not normally captured as part of recording but are still useful to store in WARCs.
|
||||
|
||||
To add a custom resource, simply call ``PUT /<coll>/record`` with the data to be added as the request body and the type of the data specified as the request ``Content-Type``. The ``url`` of the new record must be specified as a query param.
|
||||
|
||||
For example, adding a custom record ``file:///my-custom-resource`` containing ``Some Custom Data`` can be done using ``curl`` as follows::
|
||||
|
||||
curl -XPUT "localhost:8080/my-web-archive/record?url=file:///my-custom-resource" --data "Some Custom Data"
|
||||
|
||||
|
||||
This feature is only available if ``enable_put_custom_record: true`` is set in the recorder config.
|
||||
|
||||
|
||||
.. _auto-fetch:
|
||||
|
||||
|
@ -2,10 +2,10 @@ from gevent.monkey import patch_all; patch_all()
|
||||
|
||||
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
||||
from werkzeug.wsgi import pop_path_info
|
||||
from six.moves.urllib.parse import urljoin
|
||||
from six.moves.urllib.parse import urljoin, parse_qsl
|
||||
from six import iteritems
|
||||
from warcio.utils import to_native_str
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
from warcio.timeutils import iso_date_to_timestamp, timestamp_to_iso_date
|
||||
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||
|
||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||
@ -74,6 +74,7 @@ class FrontEndApp(object):
|
||||
custom_config=custom_config)
|
||||
self.recorder = None
|
||||
self.recorder_path = None
|
||||
self.put_custom_record_path = None
|
||||
self.proxy_default_timestamp = None
|
||||
|
||||
config = self.warcserver.config
|
||||
@ -173,6 +174,10 @@ class FrontEndApp(object):
|
||||
if self.recorder_path:
|
||||
routes.append(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
|
||||
|
||||
# enable PUT of custom data as 'resource' records
|
||||
if self.put_custom_record_path:
|
||||
routes.append(Rule(coll_prefix + self.RECORD_ROUTE, endpoint=self.put_custom_record, methods=["PUT"]))
|
||||
|
||||
return routes
|
||||
|
||||
def get_upstream_paths(self, port):
|
||||
@ -259,6 +264,10 @@ class FrontEndApp(object):
|
||||
|
||||
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
|
||||
|
||||
# enable PUT of custom data as 'resource' records
|
||||
if recorder_config.get('enable_put_custom_record'):
|
||||
self.put_custom_record_path = self.recorder_path + '&put_record={rec_type}&url={url}'
|
||||
|
||||
def init_autoindex(self, auto_interval):
|
||||
"""Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op.
|
||||
|
||||
@ -476,6 +485,47 @@ class FrontEndApp(object):
|
||||
|
||||
return self.rewriterapp.render_content(wb_url_str, coll_config, environ)
|
||||
|
||||
def put_custom_record(self, environ, coll="$root"):
    """ When recording, PUT a custom WARC record to the specified collection
    (Available only when recording)

    The request body becomes the record payload and the request
    ``Content-Type`` (default ``text/plain``) is forwarded with it.
    The following query arguments are read from ``QUERY_STRING``:

    * ``url`` (required): target URI of the new record
    * ``timestamp`` (optional): converted with ``timestamp_to_iso_date``
      and sent as the ``WARC-Date`` header

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be added to
    :return: The recorder's JSON reply, or a ``400 Bad Request`` JSON error
        if no ``url`` query argument was supplied
    :rtype: WbResponse
    """
    # Imported locally so this block does not depend on the module-level
    # import list; 'six' is already a dependency of this module.
    from six.moves.urllib.parse import quote

    # Drain the request body; read() is repeated until it returns nothing.
    chunks = []
    while True:
        buff = environ["wsgi.input"].read()
        if not buff:
            break

        chunks.append(buff)

    data = b"".join(chunks)

    # A missing QUERY_STRING is treated the same as an empty one.
    params = dict(parse_qsl(environ.get("QUERY_STRING") or ""))

    rec_type = "resource"

    headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

    target_uri = params.get("url")

    if not target_uri:
        return WbResponse.json_response({"error": "no url"}, status="400 Bad Request")

    timestamp = params.get("timestamp")
    if timestamp:
        headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

    # parse_qsl() has already percent-decoded the target URI. Re-encode it
    # before embedding it as a query value, otherwise any '&', '#' or '+'
    # inside it would be misread as part of the recorder URL itself.
    put_url = self.put_custom_record_path.format(
        url=quote(target_uri, safe=""), coll=coll, rec_type=rec_type
    )
    res = requests.put(put_url, headers=headers, data=data)

    res = res.json()

    return WbResponse.json_response(res)
|
||||
|
||||
def setup_paths(self, environ, coll, record=False):
|
||||
"""Populates the WSGI environment dictionary with the path information necessary to perform a response for
|
||||
content or record.
|
||||
|
@ -1,4 +1,4 @@
|
||||
__version__ = '2.6.0b3'
|
||||
__version__ = '2.6.0b4'
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
|
@ -2,7 +2,9 @@ debug: true
|
||||
|
||||
collections_root: _test_colls
|
||||
|
||||
recorder: live
|
||||
recorder:
|
||||
source_coll: live
|
||||
enable_put_custom_record: true
|
||||
|
||||
collections:
|
||||
'live': '$live'
|
||||
|
@ -136,6 +136,18 @@ class TestRecordReplay(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
||||
assert to_path('collection="test2"') in link_lines[3]
|
||||
#assert to_path('collection="test"') in link_lines[4]
|
||||
|
||||
def test_put_custom_record(self):
    """PUT an HTML payload to ``/test2/record`` as a custom record."""
    html_body = b'<html><body>This is custom data added directly. <a href="/test">Link</a></body></html>'
    record_endpoint = '/test2/record?url=https://example.com/custom/record'

    self.testapp.put(record_endpoint, params=html_body, content_type="text/html")
|
||||
|
||||
def test_replay_custom_record(self, fmod):
    """Fetch the custom record URL under ``/test2/`` and check its content type and rewritten link."""
    self.ensure_empty()

    # Modifier prefix is only added (with its trailing slash) when one is set.
    if fmod:
        fmod_slash = fmod + '/'
    else:
        fmod_slash = ''

    response = self.get('/test2/{0}https://example.com/custom/record', fmod_slash)

    assert response.content_type == 'text/html'
    assert 'This is custom data added directly. <a href="/test2/' in response.text
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
||||
|
Loading…
x
Reference in New Issue
Block a user