mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Support for default timestamp/date for proxy mode (#454)
* proxy: add option to set default timestamp for proxy mode, fixes #452 - set via flag --proxy-default-timestamp or config 'proxy_options.default_timestamp' - can be iso date or all-digit timestamp - overridable via accept-datetime header * docs: update docs for proxy timestamp - add docs on memento support in proxy mode * update-version: script can update version only, commit with 'update-version.sh commit' * indexer post append: remove 'WB_wombat_' from POST query, could have been added in previous versions of pywb!
This commit is contained in:
parent
4b5c397992
commit
455efb17ad
2
.gitignore
vendored
2
.gitignore
vendored
@ -24,6 +24,8 @@ __pycache__
|
|||||||
# ignore auto-gen certs
|
# ignore auto-gen certs
|
||||||
ca/pywb-ca.pem
|
ca/pywb-ca.pem
|
||||||
ca/certs/
|
ca/certs/
|
||||||
|
proxy-certs/
|
||||||
|
collections/
|
||||||
|
|
||||||
# Installer logs
|
# Installer logs
|
||||||
pip-log.txt
|
pip-log.txt
|
||||||
|
@ -1,3 +1,10 @@
|
|||||||
|
pywb 2.2.20190311 changelist
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
* Support for setting a default timestamp in proxy mode via ``--proxy-default-timestamp`` (fixes #452)
|
||||||
|
* Remove any ``WB_wombat_`` found in POST requests from old versions of pywb.
|
||||||
|
|
||||||
|
|
||||||
pywb 2.2.x changelist
|
pywb 2.2.x changelist
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
@ -433,6 +433,22 @@ To enable proxy mode, the collection can be specified by running: ``wayback --pr
|
|||||||
For HTTP proxy access, this is all that is needed to use the proxy. If pywb is running on port 8080 on localhost, the following curl command should provide proxy access: ``curl -x "localhost:8080" http://example.com/``
|
For HTTP proxy access, this is all that is needed to use the proxy. If pywb is running on port 8080 on localhost, the following curl command should provide proxy access: ``curl -x "localhost:8080" http://example.com/``
|
||||||
|
|
||||||
|
|
||||||
|
Default Proxy Timestamp
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The timestamp can also be optionally specified by running: ``wayback --proxy my-coll --proxy-default-timestamp 20181226010203`` or by specifying the config::
|
||||||
|
|
||||||
|
proxy:
|
||||||
|
coll: my-coll
|
||||||
|
default_timestamp: '20181226010203'
|
||||||
|
|
||||||
|
The ISO date format, e.g. ``2018-12-26T01:02:03``, is also accepted.
|
||||||
|
|
||||||
|
If the timestamp is omitted, proxy mode replay defaults to the latest capture.
|
||||||
|
|
||||||
|
The timestamp can also be dynamically overridden per-request using the :ref:`memento-proxy`.
|
||||||
|
|
||||||
|
|
||||||
Proxy Mode Rewriting
|
Proxy Mode Rewriting
|
||||||
^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
@ -466,7 +482,6 @@ If omitted, the defaults for these options are::
|
|||||||
enable_wombat: false
|
enable_wombat: false
|
||||||
enable_content_rewrite: true
|
enable_content_rewrite: true
|
||||||
|
|
||||||
|
|
||||||
For example, to enable wombat rewriting but disable the banner, use the config::
|
For example, to enable wombat rewriting but disable the banner, use the config::
|
||||||
|
|
||||||
proxy:
|
proxy:
|
||||||
@ -530,6 +545,7 @@ The following are all the available proxy options -- only ``coll`` is required::
|
|||||||
recording: false
|
recording: false
|
||||||
enable_banner: true
|
enable_banner: true
|
||||||
enable_content_rewrite: true
|
enable_content_rewrite: true
|
||||||
|
default_timestamp: ''
|
||||||
|
|
||||||
The HTTP/S functionality is provided by the separate :mod:`wsgiprox` utility which provides HTTP/S proxy routing
|
The HTTP/S functionality is provided by the separate :mod:`wsgiprox` utility which provides HTTP/S proxy routing
|
||||||
to any WSGI application.
|
to any WSGI application.
|
||||||
|
@ -67,6 +67,19 @@ with the redirect.
|
|||||||
As this approach always includes a redirect, use of this system is discouraged when the intent is to render mementos. However, this approach is useful when the goal is to determine the URI-M and to provide backwards compatibility.
|
As this approach always includes a redirect, use of this system is discouraged when the intent is to render mementos. However, this approach is useful when the goal is to determine the URI-M and to provide backwards compatibility.
|
||||||
|
|
||||||
|
|
||||||
|
.. _memento-proxy:
|
||||||
|
Proxy Mode Memento API
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
When running in :ref:`https-proxy`, pywb behaves roughly in accordance with `Memento Pattern 1.3 <https://tools.ietf.org/html/rfc7089#section-4.1.3>`_
|
||||||
|
|
||||||
|
Every URI in proxy mode is also a TimeGate, and the ``Accept-Datetime`` header can be used to specify which timestamp to use in proxy mode.
|
||||||
|
The ``Accept-Datetime`` header overrides any other timestamp setting in proxy mode.
|
||||||
|
|
||||||
|
The main distinction from the standard is that the URI-R, the original resource, is not available in proxy mode. (It is simply the URL loaded without the proxy,
|
||||||
|
which is not possible to specify via the URL alone).
|
||||||
|
|
||||||
|
|
||||||
URI-M Headers
|
URI-M Headers
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
|
@ -56,6 +56,8 @@ class BaseCli(object):
|
|||||||
help='Enable recording from the live web')
|
help='Enable recording from the live web')
|
||||||
parser.add_argument('--proxy',
|
parser.add_argument('--proxy',
|
||||||
help='Enable HTTP/S proxy on specified collection')
|
help='Enable HTTP/S proxy on specified collection')
|
||||||
|
parser.add_argument('-pt', '--proxy-default-timestamp',
|
||||||
|
help='Default timestamp / ISO date to use for proxy requests')
|
||||||
parser.add_argument('--proxy-record', action='store_true',
|
parser.add_argument('--proxy-record', action='store_true',
|
||||||
help='Enable proxy recording into specified collection')
|
help='Enable proxy recording into specified collection')
|
||||||
parser.add_argument('--proxy-enable-wombat', action='store_true',
|
parser.add_argument('--proxy-enable-wombat', action='store_true',
|
||||||
@ -76,7 +78,8 @@ class BaseCli(object):
|
|||||||
self.extra_config['proxy'] = {
|
self.extra_config['proxy'] = {
|
||||||
'coll': self.r.proxy,
|
'coll': self.r.proxy,
|
||||||
'recording': self.r.proxy_record,
|
'recording': self.r.proxy_record,
|
||||||
'enable_wombat': self.r.proxy_enable_wombat
|
'enable_wombat': self.r.proxy_enable_wombat,
|
||||||
|
'default_timestamp': self.r.proxy_default_timestamp,
|
||||||
}
|
}
|
||||||
|
|
||||||
self.r.live = True
|
self.r.live = True
|
||||||
|
@ -8,6 +8,7 @@ from six.moves.urllib.parse import urljoin
|
|||||||
from six import iteritems
|
from six import iteritems
|
||||||
from warcio.statusandheaders import StatusAndHeaders
|
from warcio.statusandheaders import StatusAndHeaders
|
||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
|
from warcio.timeutils import iso_date_to_timestamp
|
||||||
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||||
|
|
||||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||||
@ -26,6 +27,8 @@ from pywb.apps.rewriterapp import RewriterApp, UpstreamException
|
|||||||
from pywb.apps.wbrequestresponse import WbResponse
|
from pywb.apps.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
import requests
|
import requests
|
||||||
import logging
|
import logging
|
||||||
@ -54,6 +57,8 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
|
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
|
||||||
|
|
||||||
|
ALL_DIGITS = re.compile(r'^\d+$')
|
||||||
|
|
||||||
def __init__(self, config_file='./config.yaml', custom_config=None):
|
def __init__(self, config_file='./config.yaml', custom_config=None):
|
||||||
"""
|
"""
|
||||||
:param str config_file: Path to the config file
|
:param str config_file: Path to the config file
|
||||||
@ -559,6 +564,14 @@ class FrontEndApp(object):
|
|||||||
else:
|
else:
|
||||||
self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)
|
self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)
|
||||||
|
|
||||||
|
self.proxy_default_timestamp = proxy_config.get('default_timestamp')
|
||||||
|
if self.proxy_default_timestamp:
|
||||||
|
if not self.ALL_DIGITS.match(self.proxy_default_timestamp):
|
||||||
|
try:
|
||||||
|
self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp)
|
||||||
|
except:
|
||||||
|
raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format')
|
||||||
|
|
||||||
self.proxy_coll = proxy_coll
|
self.proxy_coll = proxy_coll
|
||||||
|
|
||||||
self.handler = WSGIProxMiddleware(self.handle_request,
|
self.handler = WSGIProxMiddleware(self.handle_request,
|
||||||
@ -572,6 +585,9 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
Default is to use the 'proxy_prefix' to point to the proxy collection
|
Default is to use the 'proxy_prefix' to point to the proxy collection
|
||||||
"""
|
"""
|
||||||
|
if self.proxy_default_timestamp:
|
||||||
|
environ['pywb_proxy_default_timestamp'] = self.proxy_default_timestamp
|
||||||
|
|
||||||
return self.proxy_prefix + url
|
return self.proxy_prefix + url
|
||||||
|
|
||||||
def proxy_fetch(self, env, url):
|
def proxy_fetch(self, env, url):
|
||||||
|
@ -135,6 +135,10 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
wb_url.type = wb_url.REPLAY
|
wb_url.type = wb_url.REPLAY
|
||||||
|
|
||||||
|
elif 'pywb_proxy_default_timestamp' in environ:
|
||||||
|
wb_url.timestamp = environ['pywb_proxy_default_timestamp']
|
||||||
|
wb_url.type = wb_url.REPLAY
|
||||||
|
|
||||||
return is_timegate
|
return is_timegate
|
||||||
|
|
||||||
def _check_range(self, inputreq, wb_url):
|
def _check_range(self, inputreq, wb_url):
|
||||||
|
@ -70,6 +70,7 @@ class ArchiveIndexEntryMixin(object):
|
|||||||
post_query = other.get('_post_query')
|
post_query = other.get('_post_query')
|
||||||
url = self['url']
|
url = self['url']
|
||||||
new_url = post_query.append_query(url)
|
new_url = post_query.append_query(url)
|
||||||
|
new_url = new_url.replace('WB_wombat_', '')
|
||||||
if post_query and new_url != url:
|
if post_query and new_url != url:
|
||||||
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
self['urlkey'] = canonicalize(new_url, surt_ordered)
|
||||||
other['urlkey'] = self['urlkey']
|
other['urlkey'] = self['urlkey']
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = '2.2.20190227'
|
__version__ = '2.2.20190310'
|
||||||
|
@ -20,12 +20,22 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
|
|||||||
'ca_name': 'pywb HTTPS Proxy CA',
|
'ca_name': 'pywb HTTPS Proxy CA',
|
||||||
'coll': 'test',
|
'coll': 'test',
|
||||||
'recording': False,
|
'recording': False,
|
||||||
'enable_wombat': False}
|
'enable_wombat': False,
|
||||||
|
'default_timestamp': None
|
||||||
|
}
|
||||||
assert res.extra_config['proxy'] == exp
|
assert res.extra_config['proxy'] == exp
|
||||||
|
|
||||||
def test_auto_fetch_cli(self):
|
def test_proxy_cli_ts_iso_date(self):
|
||||||
res = wayback(['--enable-auto-fetch'])
|
res = wayback(['--proxy', 'test', '--proxy-default-timestamp', '2014-01-03 00:01:02'])
|
||||||
assert res.extra_config['enable_auto_fetch'] == True
|
assert res.application.proxy_default_timestamp == '20140103000102'
|
||||||
|
|
||||||
|
def test_proxy_cli_ts(self):
|
||||||
|
res = wayback(['--proxy', 'test', '--proxy-default-timestamp', '20140103000102'])
|
||||||
|
assert res.application.proxy_default_timestamp == '20140103000102'
|
||||||
|
|
||||||
|
def test_proxy_cli_ts_err_invalid_ts(self):
|
||||||
|
with pytest.raises(Exception):
|
||||||
|
res = wayback(['--proxy', 'test', '--proxy-default-timestamp', '2014abc'])
|
||||||
|
|
||||||
def test_proxy_cli_rec(self):
|
def test_proxy_cli_rec(self):
|
||||||
res = wayback(['--proxy', 'test', '--proxy-record'])
|
res = wayback(['--proxy', 'test', '--proxy-record'])
|
||||||
@ -36,6 +46,10 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
|
|||||||
with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
res = wayback(['--proxy', 'test/foo'])
|
res = wayback(['--proxy', 'test/foo'])
|
||||||
|
|
||||||
|
def test_auto_fetch_cli(self):
|
||||||
|
res = wayback(['--enable-auto-fetch'])
|
||||||
|
assert res.extra_config['enable_auto_fetch'] == True
|
||||||
|
|
||||||
def test_all_cli(self):
|
def test_all_cli(self):
|
||||||
res = wayback(['--all-coll', 'all'])
|
res = wayback(['--all-coll', 'all'])
|
||||||
assert res.extra_config['collections']['all'] == '$all'
|
assert res.extra_config['collections']['all'] == '$all'
|
||||||
|
@ -109,6 +109,37 @@ class TestProxy(BaseTestProxy):
|
|||||||
assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
|
assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class TestProxyDefaultDate(BaseTestProxy):
|
||||||
|
@classmethod
|
||||||
|
def setup_class(cls):
|
||||||
|
super(TestProxyDefaultDate, cls).setup_class(proxy_opts={'default_timestamp': '20111226181251'})
|
||||||
|
|
||||||
|
def test_proxy_default_replay_dt(self, scheme):
|
||||||
|
res = requests.get('{0}://example.com/'.format(scheme),
|
||||||
|
proxies=self.proxies,
|
||||||
|
verify=self.root_ca_file)
|
||||||
|
|
||||||
|
assert 'WB Insert' in res.text
|
||||||
|
assert 'Example Domain' in res.text
|
||||||
|
|
||||||
|
# no wombat.js and wombatProxyMode.js
|
||||||
|
assert 'wombat.js' not in res.text
|
||||||
|
assert 'wombatProxyMode.js' not in res.text
|
||||||
|
|
||||||
|
# no auto fetch
|
||||||
|
assert 'wbinfo.enable_auto_fetch = false;' in res.text
|
||||||
|
|
||||||
|
# banner
|
||||||
|
assert 'default_banner.js' in res.text
|
||||||
|
|
||||||
|
# no redirect check
|
||||||
|
assert 'window == window.top' not in res.text
|
||||||
|
|
||||||
|
assert res.headers['Link'] == '<http://test@example.com/>; rel="memento"; datetime="Mon, 29 Jul 2013 19:51:51 GMT"; collection="pywb"'
|
||||||
|
assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class TestRecordingProxy(HttpBinLiveTests, CollsDirMixin, BaseTestProxy):
|
class TestRecordingProxy(HttpBinLiveTests, CollsDirMixin, BaseTestProxy):
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -4,13 +4,24 @@ BASE=2.2
|
|||||||
|
|
||||||
NOW=$(date +%Y%m%d)
|
NOW=$(date +%Y%m%d)
|
||||||
|
|
||||||
TAG="$BASE.$NOW"
|
VERSION="$BASE.$NOW"
|
||||||
|
|
||||||
# Update
|
# Update
|
||||||
sed -i='' -E "s/(__version__ = ').*$/\1$TAG'/" ./pywb/version.py
|
echo "Updating version to $VERSION"
|
||||||
git commit -m "version: update to $TAG" ./pywb/version.py
|
sed -i='' -E "s/(__version__ = ').*$/\1$VERSION'/" ./pywb/version.py
|
||||||
|
|
||||||
|
if [ "$1" != "commit" ]; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
TAG=v-$VERSION
|
||||||
|
|
||||||
|
echo "Committing Tag $TAG"
|
||||||
|
|
||||||
|
git commit -m "version: update to $VERSION" ./pywb/version.py
|
||||||
git push
|
git push
|
||||||
|
|
||||||
# Tag
|
# Tag
|
||||||
git tag v-$TAG
|
git tag $TAG
|
||||||
git push origin v-$TAG
|
git push origin $TAG
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user