From 455efb17ad5f1679dbbf32e5c670c5c9510e5d57 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 11 Mar 2019 16:28:09 -0700 Subject: [PATCH] Support for default timestamp/date for proxy mode (#454) * proxy: add option to set default timestamp for proxy mode, fixes #452 - set via flag --proxy-default-timestamp or config 'proxy.default_timestamp' - can be iso date or all-digit timestamp - overridable via accept-datetime header * docs: update docs for proxy timestamp - add docs on memento support in proxy mode * update-version: script can update version only, commit with 'update-version.sh commit' * indexer post append: remove 'WB_wombat_' from POST query, could have been added in previous versions of pywb! --- .gitignore | 2 ++ CHANGES.rst | 7 +++++++ docs/manual/configuring.rst | 20 ++++++++++++++++++-- docs/manual/memento.rst | 13 +++++++++++++ pywb/apps/cli.py | 5 ++++- pywb/apps/frontendapp.py | 16 ++++++++++++++++ pywb/apps/rewriterapp.py | 4 ++++ pywb/indexer/archiveindexer.py | 1 + pywb/version.py | 2 +- tests/test_cli.py | 22 ++++++++++++++++++---- tests/test_proxy.py | 31 +++++++++++++++++++++++++++++++ update-version.sh | 21 ++++++++++++++++----- 12 files changed, 131 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index cd0bc5c7..7be6b0b8 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,8 @@ __pycache__ # ignore auto-gen certs ca/pywb-ca.pem ca/certs/ +proxy-certs/ +collections/ # Installer logs pip-log.txt diff --git a/CHANGES.rst b/CHANGES.rst index d6aa8ddc..8da42496 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,10 @@ +pywb 2.2.20190311 changelist +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Support for setting the default timestamp in proxy mode via ``--proxy-default-timestamp`` (fixes #452) +* Remove any ``WB_wombat_`` found in POST requests from old versions of pywb. 
+ + pywb 2.2.x changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index f6463260..8cbfd308 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -429,10 +429,26 @@ To enable proxy mode, the collection can be specified by running: ``wayback --pr proxy: coll: my-coll - + For HTTP proxy access, this is all that is needed to use the proxy. If pywb is running on port 8080 on localhost, the following curl command should provide proxy access: ``curl -x "localhost:8080" http://example.com/`` +Default Proxy Timestamp +^^^^^^^^^^^^^^^^^^^^^^^ + +The timestamp can also be optionally specified by running: ``wayback --proxy my-coll --proxy-default-timestamp 20181226010203`` or by specifying the config:: + + proxy: + coll: my-coll + default_timestamp: '20181226010203' + +The ISO date format, e.g. ``2018-12-26T01:02:03``, is also accepted (quote the value in the YAML config so that it is read as a string). + +If the timestamp is omitted, proxy mode replay defaults to the latest capture. + +The timestamp can also be dynamically overridden per-request using the :ref:`memento-proxy`. + + Proxy Mode Rewriting ^^^^^^^^^^^^^^^^^^^^ @@ -466,7 +482,6 @@ If omitted, the defaults for these options are:: enable_wombat: false enable_content_rewrite: true - For example, to enable wombat rewriting but disable the banner, use the config:: proxy: @@ -530,6 +545,7 @@ The following are all the available proxy options -- only ``coll`` is required:: recording: false enable_banner: true enable_content_rewrite: true + default_timestamp: '' The HTTP/S functionality is provided by the separate :mod:`wsgiprox` utility which provides HTTP/S proxy routing to any WSGI application. diff --git a/docs/manual/memento.rst b/docs/manual/memento.rst index 541a8453..d9087b76 100644 --- a/docs/manual/memento.rst +++ b/docs/manual/memento.rst @@ -67,6 +67,19 @@ with the redirect. As this approach always includes a redirect, use of this system is discouraged when the intent is to render mementos. 
However, this approach is useful when the goal is to determine the URI-M and to provide backwards compatibility. +.. _memento-proxy: +Proxy Mode Memento API +^^^^^^^^^^^^^^^^^^^^^^ + +When running in :ref:`https-proxy`, pywb behaves roughly in accordance with `Memento Pattern 1.3 <https://tools.ietf.org/html/rfc7089#section-4.1.3>`_ + +Every URI in proxy mode is also a TimeGate, and the ``Accept-Datetime`` header can be used to specify which timestamp to use in proxy mode. +The ``Accept-Datetime`` header overrides any other timestamp setting in proxy mode. + +The main distinction from the standard is that the URI-R, the original resource, is not available in proxy mode. (It is simply the URL loaded without the proxy, +which is not possible to specify via the URL alone). + + URI-M Headers ------------- diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 031afe55..89b25a51 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -56,6 +56,8 @@ class BaseCli(object): help='Enable recording from the live web') parser.add_argument('--proxy', help='Enable HTTP/S proxy on specified collection') + parser.add_argument('-pt', '--proxy-default-timestamp', + help='Default timestamp / ISO date to use for proxy requests') parser.add_argument('--proxy-record', action='store_true', help='Enable proxy recording into specified collection') parser.add_argument('--proxy-enable-wombat', action='store_true', @@ -76,7 +78,8 @@ class BaseCli(object): self.extra_config['proxy'] = { 'coll': self.r.proxy, 'recording': self.r.proxy_record, - 'enable_wombat': self.r.proxy_enable_wombat + 'enable_wombat': self.r.proxy_enable_wombat, + 'default_timestamp': self.r.proxy_default_timestamp, } self.r.live = True diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 33640133..9626a07f 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -8,6 +8,7 @@ from six.moves.urllib.parse import urljoin from six import iteritems from warcio.statusandheaders import StatusAndHeaders from warcio.utils import to_native_str +from 
warcio.timeutils import iso_date_to_timestamp from wsgiprox.wsgiprox import WSGIProxMiddleware from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter @@ -26,6 +27,8 @@ from pywb.apps.rewriterapp import RewriterApp, UpstreamException from pywb.apps.wbrequestresponse import WbResponse import os +import re + import traceback import requests import logging @@ -54,6 +57,8 @@ class FrontEndApp(object): PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem') + ALL_DIGITS = re.compile(r'^\d+$') + def __init__(self, config_file='./config.yaml', custom_config=None): """ :param str config_file: Path to the config file @@ -559,6 +564,14 @@ class FrontEndApp(object): else: self.proxy_prefix = '/{0}/id_/'.format(proxy_coll) + self.proxy_default_timestamp = proxy_config.get('default_timestamp') + if self.proxy_default_timestamp: + if not self.ALL_DIGITS.match(self.proxy_default_timestamp): + try: + self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp) + except: + raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format') + self.proxy_coll = proxy_coll self.handler = WSGIProxMiddleware(self.handle_request, @@ -572,6 +585,9 @@ class FrontEndApp(object): Default is to use the 'proxy_prefix' to point to the proxy collection """ + if self.proxy_default_timestamp: + environ['pywb_proxy_default_timestamp'] = self.proxy_default_timestamp + return self.proxy_prefix + url def proxy_fetch(self, env, url): diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 1221ba9a..6e3d375f 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -135,6 +135,10 @@ class RewriterApp(object): wb_url.type = wb_url.REPLAY + elif 'pywb_proxy_default_timestamp' in environ: + wb_url.timestamp = environ['pywb_proxy_default_timestamp'] + wb_url.type = wb_url.REPLAY + return is_timegate def _check_range(self, inputreq, wb_url): diff --git a/pywb/indexer/archiveindexer.py b/pywb/indexer/archiveindexer.py index 
8c2fbbc0..668a6c9f 100644 --- a/pywb/indexer/archiveindexer.py +++ b/pywb/indexer/archiveindexer.py @@ -70,6 +70,7 @@ class ArchiveIndexEntryMixin(object): post_query = other.get('_post_query') url = self['url'] new_url = post_query.append_query(url) + new_url = new_url.replace('WB_wombat_', '') if post_query and new_url != url: self['urlkey'] = canonicalize(new_url, surt_ordered) other['urlkey'] = self['urlkey'] diff --git a/pywb/version.py b/pywb/version.py index 71b60c94..b53dac8e 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1 +1 @@ -__version__ = '2.2.20190227' +__version__ = '2.2.20190310' diff --git a/tests/test_cli.py b/tests/test_cli.py index 451f401b..47b43221 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,12 +20,22 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass): 'ca_name': 'pywb HTTPS Proxy CA', 'coll': 'test', 'recording': False, - 'enable_wombat': False} + 'enable_wombat': False, + 'default_timestamp': None + } assert res.extra_config['proxy'] == exp - def test_auto_fetch_cli(self): - res = wayback(['--enable-auto-fetch']) - assert res.extra_config['enable_auto_fetch'] == True + def test_proxy_cli_ts_iso_date(self): + res = wayback(['--proxy', 'test', '--proxy-default-timestamp', '2014-01-03 00:01:02']) + assert res.application.proxy_default_timestamp == '20140103000102' + + def test_proxy_cli_ts(self): + res = wayback(['--proxy', 'test', '--proxy-default-timestamp', '20140103000102']) + assert res.application.proxy_default_timestamp == '20140103000102' + + def test_proxy_cli_ts_err_invalid_ts(self): + with pytest.raises(Exception): + res = wayback(['--proxy', 'test', '--proxy-default-timestamp', '2014abc']) def test_proxy_cli_rec(self): res = wayback(['--proxy', 'test', '--proxy-record']) @@ -36,6 +46,10 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass): with pytest.raises(Exception): res = wayback(['--proxy', 'test/foo']) + def test_auto_fetch_cli(self): + res = wayback(['--enable-auto-fetch']) + assert 
res.extra_config['enable_auto_fetch'] == True + def test_all_cli(self): res = wayback(['--all-coll', 'all']) assert res.extra_config['collections']['all'] == '$all' diff --git a/tests/test_proxy.py b/tests/test_proxy.py index d0826404..fc925b9a 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -109,6 +109,37 @@ class TestProxy(BaseTestProxy): assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT' +# ============================================================================ +class TestProxyDefaultDate(BaseTestProxy): + @classmethod + def setup_class(cls): + super(TestProxyDefaultDate, cls).setup_class(proxy_opts={'default_timestamp': '20111226181251'}) + + def test_proxy_default_replay_dt(self, scheme): + res = requests.get('{0}://example.com/'.format(scheme), + proxies=self.proxies, + verify=self.root_ca_file) + + assert 'WB Insert' in res.text + assert 'Example Domain' in res.text + + # no wombat.js and wombatProxyMode.js + assert 'wombat.js' not in res.text + assert 'wombatProxyMode.js' not in res.text + + # no auto fetch + assert 'wbinfo.enable_auto_fetch = false;' in res.text + + # banner + assert 'default_banner.js' in res.text + + # no redirect check + assert 'window == window.top' not in res.text + + assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 29 Jul 2013 19:51:51 GMT"; collection="pywb"' + assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT' + + # ============================================================================ class TestRecordingProxy(HttpBinLiveTests, CollsDirMixin, BaseTestProxy): @classmethod diff --git a/update-version.sh b/update-version.sh index 8a9b7063..c29ef02e 100755 --- a/update-version.sh +++ b/update-version.sh @@ -4,13 +4,24 @@ BASE=2.2 NOW=$(date +%Y%m%d) -TAG="$BASE.$NOW" +VERSION="$BASE.$NOW" # Update -sed -i='' -E "s/(__version__ = ').*$/\1$TAG'/" ./pywb/version.py -git commit -m "version: update to $TAG" ./pywb/version.py +echo "Updating version to 
$VERSION" +sed -i='' -E "s/(__version__ = ').*$/\1$VERSION'/" ./pywb/version.py + +if [ "$1" != "commit" ]; then + exit 0 +fi + +TAG=v-$VERSION + +echo "Committing Tag $TAG" + +git commit -m "version: update to $VERSION" ./pywb/version.py git push # Tag -git tag v-$TAG -git push origin v-$TAG +git tag $TAG +git push origin $TAG +