mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-24 06:59:52 +01:00)
add ProxyLiveIndexSource for proxying upstream conn directly w/o a second index query
liveloader: if the 'memento_url' key is set, a Memento-Datetime header must be present in the response, otherwise it is treated as an error response
liveindexsource: add option to specify a custom live path (e.g. a prefix for caching)
fix test cases changed due to ia (todo: mock up all external data!)
This commit is contained in:
parent c1895ae70f
commit 107ba9aabc
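A rough usage sketch of what this commit enables (the cache prefix and upstream address below are made-up placeholder values; only the module, class, and method names come from the diff that follows):

# Illustrative sketch only -- the URLs are placeholders and this wiring is not part of the commit.
from webagg.indexsource import LiveIndexSource
from webagg.proxyindexsource import ProxyMementoIndexSource

# LiveIndexSource now accepts a custom live path template, e.g. a caching prefix
live = LiveIndexSource(proxy_url='http://cache.example.com/live/{url}')

# The new proxy source points load_url straight at an upstream /resource endpoint,
# so the record can be fetched without a second index query
upstream = ProxyMementoIndexSource.upstream_resource('http://localhost:8080')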
setup.py (2 lines changed)
@@ -14,7 +14,7 @@ class PyTest(TestCommand):
         import pytest
         import sys
         import os
-        cmdline = ' --cov webagg -v test/'
+        cmdline = ' --cov webagg -vv test/'
         errcode = pytest.main(cmdline)
         sys.exit(errcode)

@@ -5,6 +5,8 @@ import json

 from .testutils import to_path

+from mock import patch
+
 from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
 from webagg.indexsource import MementoIndexSource

@@ -14,6 +16,10 @@ root_dir = None
 orig_cwd = None
 dir_loader = None

+linkheader = """\
+<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
+"""
+
 def setup_module():
     global root_dir
     root_dir = tempfile.mkdtemp()
@@ -124,7 +130,10 @@ def test_agg_all_found_2():
     assert(errs == {})


+def mock_link_header(*args, **kwargs):
+    return linkheader

+@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
 def test_agg_dir_and_memento():
     sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
                'local': dir_loader}
@@ -133,9 +142,9 @@ def test_agg_dir_and_memento():
     res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})

     exp = [
-        {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
         {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
-        {'source': 'ia', 'timestamp': '20100506013442', 'load_url': 'http://web.archive.org/web/20100506013442id_/http://example.com/'},
+        {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
+        {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
         {'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
         {'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
         {'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
@@ -73,10 +73,11 @@ def test_mem_agg_index_2(agg):

     exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
            {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
-           {"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
+           #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
            {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
+           {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
            {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
-           {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
+           {"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}]

     assert(json_list(res) == exp)
     assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
@@ -118,7 +118,6 @@ class ResourceHandler(IndexHandler):
 class DefaultResourceHandler(ResourceHandler):
     def __init__(self, index_source, warc_paths=''):
         loaders = [WARCPathLoader(warc_paths, index_source),
-                   # UpstreamProxyLoader(),
                    LiveWebLoader(),
                   ]
         super(DefaultResourceHandler, self).__init__(index_source, loaders)
@@ -2,12 +2,11 @@ import redis

 from pywb.utils.binsearch import iter_range
 from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
-from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
-from pywb.utils.canonicalize import canonicalize, calc_search_range
+from pywb.utils.timeutils import timestamp_now
+from pywb.utils.canonicalize import canonicalize
 from pywb.utils.wbexception import NotFoundException

 from pywb.cdx.cdxobject import CDXObject
-from pywb.cdx.query import CDXQuery

 from webagg.liverec import patched_requests as requests

@@ -80,27 +79,17 @@ class RemoteIndexSource(BaseIndexSource):
         return 'remote'


-#=============================================================================
-class UpstreamAggIndexSource(RemoteIndexSource):
-    def __init__(self, base_url):
-        api_url = base_url + '/index?url={url}'
-        proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
-        super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
-
-    def _set_load_url(self, cdx):
-        super(UpstreamAggIndexSource, self)._set_load_url(cdx)
-        cdx['offset'] = '0'
-        cdx.pop('load_url', '')
-
-
 #=============================================================================
 class LiveIndexSource(BaseIndexSource):
+    def __init__(self, proxy_url='{url}'):
+        self.proxy_url = proxy_url
+
     def load_index(self, params):
         cdx = CDXObject()
         cdx['urlkey'] = params.get('key').decode('utf-8')
         cdx['timestamp'] = timestamp_now()
         cdx['url'] = params['url']
-        cdx['load_url'] = params['url']
+        cdx['load_url'] = res_template(self.proxy_url, params)
         cdx['is_live'] = 'true'
         def live():
             yield cdx
webagg/proxyindexsource.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+from pywb.cdx.cdxobject import CDXObject
+from pywb.utils.wbexception import NotFoundException
+from webagg.indexsource import BaseIndexSource, RemoteIndexSource
+from webagg.responseloader import LiveWebLoader
+from webagg.utils import ParamFormatter, res_template
+from pywb.utils.timeutils import timestamp_now
+
+
+#=============================================================================
+class UpstreamAggIndexSource(RemoteIndexSource):
+    def __init__(self, base_url):
+        api_url = base_url + '/index?url={url}'
+        proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
+        super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
+
+    def _set_load_url(self, cdx):
+        super(UpstreamAggIndexSource, self)._set_load_url(cdx)
+        cdx['offset'] = '0'
+        cdx.pop('load_url', '')
+
+
+#=============================================================================
+class ProxyMementoIndexSource(BaseIndexSource):
+    def __init__(self, proxy_url='{url}'):
+        self.proxy_url = proxy_url
+        self.loader = LiveWebLoader()
+
+    def load_index(self, params):
+        cdx = CDXObject()
+        cdx['urlkey'] = params.get('key').decode('utf-8')
+
+        closest = params.get('closest')
+        cdx['timestamp'] = closest if closest else timestamp_now()
+        cdx['url'] = params['url']
+        cdx['load_url'] = res_template(self.proxy_url, params)
+        cdx['memento_url'] = cdx['load_url']
+        return self._do_load(cdx, params)
+
+    def _do_load(self, cdx, params):
+        result = self.loader.load_resource(cdx, params)
+        if not result:
+            raise NotFoundException('Not a memento: ' + cdx['url'])
+
+        cdx['_cached_result'] = result
+        yield cdx
+
+    def __str__(self):
+        return 'proxy'
+
+    @staticmethod
+    def upstream_resource(base_url):
+        return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}')
+
@@ -3,10 +3,10 @@ from webagg.liverec import request as remote_request

 from webagg.utils import MementoUtils

-from requests import session
+from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
+from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
+from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date

-from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
-from pywb.utils.timeutils import iso_date_to_datetime
 from pywb.utils.wbexception import LiveResourceException
 from pywb.utils.statusandheaders import StatusAndHeaders

@@ -62,21 +62,32 @@ class StreamIter(six.Iterator):
 #=============================================================================
 class BaseLoader(object):
     def __call__(self, cdx, params):
-        entry = self._load_resource(cdx, params)
+        entry = self.load_resource(cdx, params)
         if not entry:
             return None, None

-        warc_headers, other_headers_buff, stream = entry
+        warc_headers, other_headers, stream = entry

         out_headers = {}
+        out_headers['WebAgg-Type'] = 'warc'
         out_headers['Source-Coll'] = cdx.get('source', '')
+        out_headers['Content-Type'] = 'application/warc-record'
+
+        if not warc_headers:
+            if other_headers:
+                out_headers['Link'] = other_headers.get('Link')
+                out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
+                out_headers['Content-Length'] = other_headers.get('Content-Length')
+
+                #for n, v in other_headers.items():
+                #    out_headers[n] = v
+
+            return out_headers, StreamIter(stream)
+
         out_headers['Link'] = MementoUtils.make_link(
                                 warc_headers.get_header('WARC-Target-URI'),
                                 'original')

-        out_headers['Content-Type'] = 'application/warc-record'
-
         memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
         out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

@@ -88,7 +99,7 @@ class BaseLoader(object):

         return out_headers, StreamIter(stream,
                                        header1=warc_headers_buff,
-                                       header2=other_headers_buff)
+                                       header2=other_headers)

     def _set_content_len(self, content_len_str, headers, existing_len):
         # Try to set content-length, if it is available and valid
@@ -134,7 +145,10 @@ class WARCPathLoader(BaseLoader):

             yield check

-    def _load_resource(self, cdx, params):
+    def load_resource(self, cdx, params):
+        if cdx.get('_cached_result'):
+            return cdx.get('_cached_result')
+
         if not cdx.get('filename') or cdx.get('offset') is None:
             return None

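The hunk above adds a '_cached_result' short-circuit: an index source that has already fetched the record (such as ProxyMementoIndexSource in the new file) can hand it straight to the loader. A minimal sketch of that contract, with placeholder values standing in for real WARC data:

# Sketch only: the tuple below is a placeholder, not a real (warc_headers, http_headers, stream) entry.
def load_resource(cdx, params):
    if cdx.get('_cached_result'):
        return cdx.get('_cached_result')
    # ...otherwise the loader would locate the record via 'filename'/'offset'
    return None

cdx = {'_cached_result': ('warc-headers', 'http-headers', 'stream')}
assert load_resource(cdx, {}) == ('warc-headers', 'http-headers', 'stream')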
@@ -174,7 +188,7 @@ class LiveWebLoader(BaseLoader):
                       b'content-location',
                       b'x-archive')

-    def _load_resource(self, cdx, params):
+    def load_resource(self, cdx, params):
         load_url = cdx.get('load_url')
         if not load_url:
             return None
@@ -187,7 +201,7 @@ class LiveWebLoader(BaseLoader):

         dt = timestamp_to_datetime(cdx['timestamp'])

-        if not cdx.get('is_live'):
+        if cdx.get('memento_url'):
             req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

         # if different url, ensure origin is not set
@@ -212,6 +226,20 @@ class LiveWebLoader(BaseLoader):
         except Exception as e:
             raise LiveResourceException(load_url)

+        memento_dt = upstream_res.headers.get('Memento-Datetime')
+        if memento_dt:
+            dt = http_date_to_datetime(memento_dt)
+            cdx['timestamp'] = datetime_to_timestamp(dt)
+        elif cdx.get('memento_url'):
+            # if 'memento_url' set and no Memento-Datetime header present
+            # then its an error
+            return None
+
+        agg_type = upstream_res.headers.get('WebAgg-Type')
+        if agg_type == 'warc':
+            cdx['source'] = upstream_res.headers.get('Source-Coll')
+            return None, upstream_res.headers, upstream_res.raw
+
         http_headers_buff = recorder.get_headers_buff()

         warc_headers = {}
@@ -219,7 +247,7 @@ class LiveWebLoader(BaseLoader):
         warc_headers['WARC-Type'] = 'response'
         warc_headers['WARC-Record-ID'] = self._make_warc_id()
         warc_headers['WARC-Target-URI'] = cdx['url']
-        warc_headers['WARC-Date'] = self._make_date(dt)
+        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
         if recorder.target_ip:
             warc_headers['WARC-IP-Address'] = recorder.target_ip

@@ -232,10 +260,6 @@ class LiveWebLoader(BaseLoader):
         warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
         return (warc_headers, http_headers_buff, upstream_res.raw)

-    @staticmethod
-    def _make_date(dt):
-        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
-
     @staticmethod
     def _make_warc_id(id_=None):
         if not id_: