mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
Add ProxyLiveIndexSource for proxying the upstream connection directly, without a second index query
liveloader: if the 'memento_url' key is set, then the Memento-Datetime header must be present, otherwise it is an error response. liveindexsource: add an option to specify a custom live path (e.g. a prefix for caching). Fix test cases that changed due to IA (todo: mock up all external data!)
This commit is contained in:
parent
c1895ae70f
commit
107ba9aabc
2
setup.py
2
setup.py
@ -14,7 +14,7 @@ class PyTest(TestCommand):
|
||||
import pytest
|
||||
import sys
|
||||
import os
|
||||
cmdline = ' --cov webagg -v test/'
|
||||
cmdline = ' --cov webagg -vv test/'
|
||||
errcode = pytest.main(cmdline)
|
||||
sys.exit(errcode)
|
||||
|
||||
|
@ -5,6 +5,8 @@ import json
|
||||
|
||||
from .testutils import to_path
|
||||
|
||||
from mock import patch
|
||||
|
||||
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
|
||||
from webagg.indexsource import MementoIndexSource
|
||||
|
||||
@ -14,6 +16,10 @@ root_dir = None
|
||||
orig_cwd = None
|
||||
dir_loader = None
|
||||
|
||||
linkheader = """\
|
||||
<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
|
||||
"""
|
||||
|
||||
def setup_module():
|
||||
global root_dir
|
||||
root_dir = tempfile.mkdtemp()
|
||||
@ -124,7 +130,10 @@ def test_agg_all_found_2():
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def mock_link_header(*args, **kwargs):
|
||||
return linkheader
|
||||
|
||||
@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
|
||||
def test_agg_dir_and_memento():
|
||||
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
|
||||
'local': dir_loader}
|
||||
@ -133,9 +142,9 @@ def test_agg_dir_and_memento():
|
||||
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
|
||||
|
||||
exp = [
|
||||
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100506013442', 'load_url': 'http://web.archive.org/web/20100506013442id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
|
||||
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
|
@ -73,10 +73,11 @@ def test_mem_agg_index_2(agg):
|
||||
|
||||
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
|
||||
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
|
||||
{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
|
||||
#{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
|
||||
{"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
|
||||
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
|
||||
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
|
||||
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
|
||||
{"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}]
|
||||
|
||||
assert(json_list(res) == exp)
|
||||
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
|
||||
|
@ -118,7 +118,6 @@ class ResourceHandler(IndexHandler):
|
||||
class DefaultResourceHandler(ResourceHandler):
|
||||
def __init__(self, index_source, warc_paths=''):
|
||||
loaders = [WARCPathLoader(warc_paths, index_source),
|
||||
# UpstreamProxyLoader(),
|
||||
LiveWebLoader(),
|
||||
]
|
||||
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
||||
|
@ -2,12 +2,11 @@ import redis
|
||||
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
||||
from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
|
||||
from pywb.utils.canonicalize import canonicalize, calc_search_range
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.cdx.query import CDXQuery
|
||||
|
||||
from webagg.liverec import patched_requests as requests
|
||||
|
||||
@ -80,27 +79,17 @@ class RemoteIndexSource(BaseIndexSource):
|
||||
return 'remote'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class UpstreamAggIndexSource(RemoteIndexSource):
|
||||
def __init__(self, base_url):
|
||||
api_url = base_url + '/index?url={url}'
|
||||
proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
|
||||
super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
|
||||
|
||||
def _set_load_url(self, cdx):
|
||||
super(UpstreamAggIndexSource, self)._set_load_url(cdx)
|
||||
cdx['offset'] = '0'
|
||||
cdx.pop('load_url', '')
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class LiveIndexSource(BaseIndexSource):
|
||||
def __init__(self, proxy_url='{url}'):
|
||||
self.proxy_url = proxy_url
|
||||
|
||||
def load_index(self, params):
|
||||
cdx = CDXObject()
|
||||
cdx['urlkey'] = params.get('key').decode('utf-8')
|
||||
cdx['timestamp'] = timestamp_now()
|
||||
cdx['url'] = params['url']
|
||||
cdx['load_url'] = params['url']
|
||||
cdx['load_url'] = res_template(self.proxy_url, params)
|
||||
cdx['is_live'] = 'true'
|
||||
def live():
|
||||
yield cdx
|
||||
|
54
webagg/proxyindexsource.py
Normal file
54
webagg/proxyindexsource.py
Normal file
@ -0,0 +1,54 @@
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from webagg.indexsource import BaseIndexSource, RemoteIndexSource
|
||||
from webagg.responseloader import LiveWebLoader
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class UpstreamAggIndexSource(RemoteIndexSource):
    """Remote index source that targets another aggregator rooted at ``base_url``.

    The index query template is ``<base_url>/index?url={url}`` and the
    resource template is ``<base_url>/resource?url={url}&closest={timestamp}``.
    """

    def __init__(self, base_url):
        """Build the upstream templates from ``base_url`` and pass them on.

        :param base_url: prefix to which the ``/index`` and ``/resource``
            endpoints are appended (plain string concatenation).
        """
        # NOTE(review): the meaning of the third positional argument
        # ('filename') is defined by RemoteIndexSource.__init__, which is
        # not visible here -- confirm against that signature.
        super(UpstreamAggIndexSource, self).__init__(
            base_url + '/index?url={url}',
            base_url + '/resource?url={url}&closest={timestamp}',
            'filename')

    def _set_load_url(self, cdx):
        """Run the parent hook, then force offset '0' and drop 'load_url'."""
        super(UpstreamAggIndexSource, self)._set_load_url(cdx)

        # Offset is always the string '0' for entries from this source.
        cdx['offset'] = '0'

        # Remove any 'load_url' left on the entry; a missing key is fine.
        cdx.pop('load_url', '')
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class ProxyMementoIndexSource(BaseIndexSource):
    """Index source that loads the resource while producing its index entry.

    ``load_index`` builds a single CDX entry whose ``load_url`` and
    ``memento_url`` both come from ``proxy_url``. It fetches that entry
    through a ``LiveWebLoader`` and stores the loader's result on the entry
    under ``_cached_result``.
    """

    def __init__(self, proxy_url='{url}'):
        """Remember the url template and create the loader.

        :param proxy_url: template passed to ``res_template`` together with
            the query params to produce the url to load.
        """
        self.proxy_url = proxy_url
        self.loader = LiveWebLoader()

    def load_index(self, params):
        """Create the CDX entry for ``params`` and return the loading iterator.

        The entry timestamp is ``params['closest']`` when that is truthy,
        otherwise the current timestamp.

        :param params: query params; ``'key'`` and ``'url'`` are read here.
        :returns: the generator produced by ``_do_load``.
        """
        cdx = CDXObject()

        # presumably params['key'] is bytes -- a missing key gives None and
        # the .decode() call would then fail; verify against the caller
        cdx['urlkey'] = params.get('key').decode('utf-8')

        cdx['timestamp'] = params.get('closest') or timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = res_template(self.proxy_url, params)

        # The same url doubles as the memento url for this entry.
        cdx['memento_url'] = cdx['load_url']

        return self._do_load(cdx, params)

    def _do_load(self, cdx, params):
        """Yield ``cdx`` once, with the loaded resource attached.

        This is a generator, so the load and any ``NotFoundException`` only
        happen when the result is first iterated.

        :raises NotFoundException: if the loader returns a falsy result.
        """
        loaded = self.loader.load_resource(cdx, params)

        if loaded:
            # Keep the loader result on the entry under '_cached_result'.
            cdx['_cached_result'] = loaded
            yield cdx
            return

        raise NotFoundException('Not a memento: ' + cdx['url'])

    def __str__(self):
        """Return the fixed label for this source type."""
        return 'proxy'

    @staticmethod
    def upstream_resource(base_url):
        """Create a source that loads from ``<base_url>/resource``.

        :param base_url: prefix to which the resource query template is
            appended.
        """
        template = base_url + '/resource?url={url}&closest={closest}'
        return ProxyMementoIndexSource(template)
|
||||
|
||||
|
@ -3,10 +3,10 @@ from webagg.liverec import request as remote_request
|
||||
|
||||
from webagg.utils import MementoUtils
|
||||
|
||||
from requests import session
|
||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
|
||||
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
|
||||
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
|
||||
|
||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
|
||||
from pywb.utils.timeutils import iso_date_to_datetime
|
||||
from pywb.utils.wbexception import LiveResourceException
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
@ -62,21 +62,32 @@ class StreamIter(six.Iterator):
|
||||
#=============================================================================
|
||||
class BaseLoader(object):
|
||||
def __call__(self, cdx, params):
|
||||
entry = self._load_resource(cdx, params)
|
||||
entry = self.load_resource(cdx, params)
|
||||
if not entry:
|
||||
return None, None
|
||||
|
||||
warc_headers, other_headers_buff, stream = entry
|
||||
warc_headers, other_headers, stream = entry
|
||||
|
||||
out_headers = {}
|
||||
out_headers['WebAgg-Type'] = 'warc'
|
||||
out_headers['Source-Coll'] = cdx.get('source', '')
|
||||
out_headers['Content-Type'] = 'application/warc-record'
|
||||
|
||||
if not warc_headers:
|
||||
if other_headers:
|
||||
out_headers['Link'] = other_headers.get('Link')
|
||||
out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
|
||||
out_headers['Content-Length'] = other_headers.get('Content-Length')
|
||||
|
||||
#for n, v in other_headers.items():
|
||||
# out_headers[n] = v
|
||||
|
||||
return out_headers, StreamIter(stream)
|
||||
|
||||
out_headers['Link'] = MementoUtils.make_link(
|
||||
warc_headers.get_header('WARC-Target-URI'),
|
||||
'original')
|
||||
|
||||
out_headers['Content-Type'] = 'application/warc-record'
|
||||
|
||||
memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
|
||||
out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
|
||||
|
||||
@ -88,7 +99,7 @@ class BaseLoader(object):
|
||||
|
||||
return out_headers, StreamIter(stream,
|
||||
header1=warc_headers_buff,
|
||||
header2=other_headers_buff)
|
||||
header2=other_headers)
|
||||
|
||||
def _set_content_len(self, content_len_str, headers, existing_len):
|
||||
# Try to set content-length, if it is available and valid
|
||||
@ -134,7 +145,10 @@ class WARCPathLoader(BaseLoader):
|
||||
|
||||
yield check
|
||||
|
||||
def _load_resource(self, cdx, params):
|
||||
def load_resource(self, cdx, params):
|
||||
if cdx.get('_cached_result'):
|
||||
return cdx.get('_cached_result')
|
||||
|
||||
if not cdx.get('filename') or cdx.get('offset') is None:
|
||||
return None
|
||||
|
||||
@ -174,7 +188,7 @@ class LiveWebLoader(BaseLoader):
|
||||
b'content-location',
|
||||
b'x-archive')
|
||||
|
||||
def _load_resource(self, cdx, params):
|
||||
def load_resource(self, cdx, params):
|
||||
load_url = cdx.get('load_url')
|
||||
if not load_url:
|
||||
return None
|
||||
@ -187,7 +201,7 @@ class LiveWebLoader(BaseLoader):
|
||||
|
||||
dt = timestamp_to_datetime(cdx['timestamp'])
|
||||
|
||||
if not cdx.get('is_live'):
|
||||
if cdx.get('memento_url'):
|
||||
req_headers['Accept-Datetime'] = datetime_to_http_date(dt)
|
||||
|
||||
# if different url, ensure origin is not set
|
||||
@ -212,6 +226,20 @@ class LiveWebLoader(BaseLoader):
|
||||
except Exception as e:
|
||||
raise LiveResourceException(load_url)
|
||||
|
||||
memento_dt = upstream_res.headers.get('Memento-Datetime')
|
||||
if memento_dt:
|
||||
dt = http_date_to_datetime(memento_dt)
|
||||
cdx['timestamp'] = datetime_to_timestamp(dt)
|
||||
elif cdx.get('memento_url'):
|
||||
# if 'memento_url' set and no Memento-Datetime header present
|
||||
# then its an error
|
||||
return None
|
||||
|
||||
agg_type = upstream_res.headers.get('WebAgg-Type')
|
||||
if agg_type == 'warc':
|
||||
cdx['source'] = upstream_res.headers.get('Source-Coll')
|
||||
return None, upstream_res.headers, upstream_res.raw
|
||||
|
||||
http_headers_buff = recorder.get_headers_buff()
|
||||
|
||||
warc_headers = {}
|
||||
@ -219,7 +247,7 @@ class LiveWebLoader(BaseLoader):
|
||||
warc_headers['WARC-Type'] = 'response'
|
||||
warc_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||
warc_headers['WARC-Target-URI'] = cdx['url']
|
||||
warc_headers['WARC-Date'] = self._make_date(dt)
|
||||
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
|
||||
if recorder.target_ip:
|
||||
warc_headers['WARC-IP-Address'] = recorder.target_ip
|
||||
|
||||
@ -232,10 +260,6 @@ class LiveWebLoader(BaseLoader):
|
||||
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
|
||||
return (warc_headers, http_headers_buff, upstream_res.raw)
|
||||
|
||||
@staticmethod
|
||||
def _make_date(dt):
|
||||
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
@staticmethod
|
||||
def _make_warc_id(id_=None):
|
||||
if not id_:
|
||||
|
Loading…
x
Reference in New Issue
Block a user