1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

add ProxyLiveIndexSource for proxying upstream conn directly w/o a second index query

liveloader: if the 'memento_url' key is set, then the Memento-Datetime header must be present, or it's an error response
liveindexsource: add option to specify a custom live path (e.g. a prefix for caching)
fix test cases changed due to ia (todo: mock up all external data!)
This commit is contained in:
Ilya Kreymer 2016-03-08 10:27:13 -08:00
parent c1895ae70f
commit 107ba9aabc
7 changed files with 115 additions and 39 deletions

View File

@ -14,7 +14,7 @@ class PyTest(TestCommand):
import pytest
import sys
import os
cmdline = ' --cov webagg -v test/'
cmdline = ' --cov webagg -vv test/'
errcode = pytest.main(cmdline)
sys.exit(errcode)

View File

@ -5,6 +5,8 @@ import json
from .testutils import to_path
from mock import patch
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
from webagg.indexsource import MementoIndexSource
@ -14,6 +16,10 @@ root_dir = None
orig_cwd = None
dir_loader = None
linkheader = """\
<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
"""
def setup_module():
global root_dir
root_dir = tempfile.mkdtemp()
@ -124,7 +130,10 @@ def test_agg_all_found_2():
assert(errs == {})
def mock_link_header(*args, **kwargs):
    """Test stub that ignores its arguments and returns ``linkheader``.

    Patched in place of ``MementoIndexSource.get_timegate_links`` so the
    test reads the canned Link header defined in this module.
    """
    return linkheader
@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
def test_agg_dir_and_memento():
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'local': dir_loader}
@ -133,9 +142,9 @@ def test_agg_dir_and_memento():
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
exp = [
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100506013442', 'load_url': 'http://web.archive.org/web/20100506013442id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}

View File

@ -73,10 +73,11 @@ def test_mem_agg_index_2(agg):
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
#{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
{"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
{"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}]
assert(json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})

View File

@ -118,7 +118,6 @@ class ResourceHandler(IndexHandler):
class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths=''):
loaders = [WARCPathLoader(warc_paths, index_source),
# UpstreamProxyLoader(),
LiveWebLoader(),
]
super(DefaultResourceHandler, self).__init__(index_source, loaders)

View File

@ -2,12 +2,11 @@ import redis
from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
from pywb.utils.canonicalize import canonicalize, calc_search_range
from pywb.utils.timeutils import timestamp_now
from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.query import CDXQuery
from webagg.liverec import patched_requests as requests
@ -80,27 +79,17 @@ class RemoteIndexSource(BaseIndexSource):
return 'remote'
#=============================================================================
class UpstreamAggIndexSource(RemoteIndexSource):
def __init__(self, base_url):
api_url = base_url + '/index?url={url}'
proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
def _set_load_url(self, cdx):
super(UpstreamAggIndexSource, self)._set_load_url(cdx)
cdx['offset'] = '0'
cdx.pop('load_url', '')
#=============================================================================
class LiveIndexSource(BaseIndexSource):
def __init__(self, proxy_url='{url}'):
self.proxy_url = proxy_url
def load_index(self, params):
cdx = CDXObject()
cdx['urlkey'] = params.get('key').decode('utf-8')
cdx['timestamp'] = timestamp_now()
cdx['url'] = params['url']
cdx['load_url'] = params['url']
cdx['load_url'] = res_template(self.proxy_url, params)
cdx['is_live'] = 'true'
def live():
yield cdx

View File

@ -0,0 +1,54 @@
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.wbexception import NotFoundException
from webagg.indexsource import BaseIndexSource, RemoteIndexSource
from webagg.responseloader import LiveWebLoader
from webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now
#=============================================================================
class UpstreamAggIndexSource(RemoteIndexSource):
    """Remote index source whose endpoints live under one upstream base url.

    The index query template is ``<base_url>/index?url={url}`` and the
    resource template is
    ``<base_url>/resource?url={url}&closest={timestamp}``.
    """

    def __init__(self, base_url):
        """Derive both endpoint templates from ``base_url``.

        :param base_url: prefix to which the ``/index`` and ``/resource``
            paths are appended (no trailing slash is added or stripped).
        """
        index_template = base_url + '/index?url={url}'
        resource_template = base_url + '/resource?url={url}&closest={timestamp}'

        # NOTE(review): the third argument is passed through unchanged to
        # RemoteIndexSource; presumably the cdx field that receives the
        # formatted resource url -- confirm against the parent class.
        super(UpstreamAggIndexSource, self).__init__(index_template,
                                                     resource_template,
                                                     'filename')

    def _set_load_url(self, cdx):
        """Run the parent hook, then force offset '0' and drop 'load_url'.

        :param cdx: the cdx entry being prepared; modified in place.
        """
        super(UpstreamAggIndexSource, self)._set_load_url(cdx)

        cdx['offset'] = '0'
        # The default makes the pop a no-op when the key is absent.
        cdx.pop('load_url', '')
#=============================================================================
class ProxyMementoIndexSource(BaseIndexSource):
    """Index source that loads the resource itself instead of querying an index.

    ``load_index`` builds one cdx entry from the request params, loads it
    through a ``LiveWebLoader`` and stores the loader's result on the entry
    under ``_cached_result``.
    """

    def __init__(self, proxy_url='{url}'):
        """
        :param proxy_url: template handed to ``res_template`` together with
            the request params to produce the url that is fetched.
        """
        self.proxy_url = proxy_url
        self.loader = LiveWebLoader()

    def load_index(self, params):
        """Create the cdx entry for ``params`` and return the loading iterator.

        :param params: request params; ``'key'`` (bytes) and ``'url'`` are
            read, and ``'closest'`` is used as the timestamp when set.
        :return: generator from ``_do_load`` yielding the single cdx entry.
        """
        entry = CDXObject()
        entry['urlkey'] = params.get('key').decode('utf-8')

        # Fall back to the current time when no (or an empty) 'closest' is given.
        entry['timestamp'] = params.get('closest') or timestamp_now()
        entry['url'] = params['url']

        target = res_template(self.proxy_url, params)
        entry['load_url'] = target
        # Same url is stored under both keys.
        entry['memento_url'] = target

        return self._do_load(entry, params)

    def _do_load(self, cdx, params):
        """Load ``cdx`` through the loader and yield it with the result attached.

        This is a generator, so the load (and any exception) happens when
        the caller first iterates, not when this method is called.

        :raises NotFoundException: if the loader returns a falsy result.
        """
        loaded = self.loader.load_resource(cdx, params)

        if loaded:
            cdx['_cached_result'] = loaded
            yield cdx
        else:
            raise NotFoundException('Not a memento: ' + cdx['url'])

    def __str__(self):
        return 'proxy'

    @staticmethod
    def upstream_resource(base_url):
        """Create a source targeting the ``/resource`` endpoint under ``base_url``."""
        template = base_url + '/resource?url={url}&closest={closest}'
        return ProxyMementoIndexSource(template)

View File

@ -3,10 +3,10 @@ from webagg.liverec import request as remote_request
from webagg.utils import MementoUtils
from requests import session
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
from pywb.utils.timeutils import iso_date_to_datetime
from pywb.utils.wbexception import LiveResourceException
from pywb.utils.statusandheaders import StatusAndHeaders
@ -62,21 +62,32 @@ class StreamIter(six.Iterator):
#=============================================================================
class BaseLoader(object):
def __call__(self, cdx, params):
entry = self._load_resource(cdx, params)
entry = self.load_resource(cdx, params)
if not entry:
return None, None
warc_headers, other_headers_buff, stream = entry
warc_headers, other_headers, stream = entry
out_headers = {}
out_headers['WebAgg-Type'] = 'warc'
out_headers['Source-Coll'] = cdx.get('source', '')
out_headers['Content-Type'] = 'application/warc-record'
if not warc_headers:
if other_headers:
out_headers['Link'] = other_headers.get('Link')
out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
out_headers['Content-Length'] = other_headers.get('Content-Length')
#for n, v in other_headers.items():
# out_headers[n] = v
return out_headers, StreamIter(stream)
out_headers['Link'] = MementoUtils.make_link(
warc_headers.get_header('WARC-Target-URI'),
'original')
out_headers['Content-Type'] = 'application/warc-record'
memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
@ -88,7 +99,7 @@ class BaseLoader(object):
return out_headers, StreamIter(stream,
header1=warc_headers_buff,
header2=other_headers_buff)
header2=other_headers)
def _set_content_len(self, content_len_str, headers, existing_len):
# Try to set content-length, if it is available and valid
@ -134,7 +145,10 @@ class WARCPathLoader(BaseLoader):
yield check
def _load_resource(self, cdx, params):
def load_resource(self, cdx, params):
if cdx.get('_cached_result'):
return cdx.get('_cached_result')
if not cdx.get('filename') or cdx.get('offset') is None:
return None
@ -174,7 +188,7 @@ class LiveWebLoader(BaseLoader):
b'content-location',
b'x-archive')
def _load_resource(self, cdx, params):
def load_resource(self, cdx, params):
load_url = cdx.get('load_url')
if not load_url:
return None
@ -187,7 +201,7 @@ class LiveWebLoader(BaseLoader):
dt = timestamp_to_datetime(cdx['timestamp'])
if not cdx.get('is_live'):
if cdx.get('memento_url'):
req_headers['Accept-Datetime'] = datetime_to_http_date(dt)
# if different url, ensure origin is not set
@ -212,6 +226,20 @@ class LiveWebLoader(BaseLoader):
except Exception as e:
raise LiveResourceException(load_url)
memento_dt = upstream_res.headers.get('Memento-Datetime')
if memento_dt:
dt = http_date_to_datetime(memento_dt)
cdx['timestamp'] = datetime_to_timestamp(dt)
elif cdx.get('memento_url'):
# if 'memento_url' set and no Memento-Datetime header present
# then its an error
return None
agg_type = upstream_res.headers.get('WebAgg-Type')
if agg_type == 'warc':
cdx['source'] = upstream_res.headers.get('Source-Coll')
return None, upstream_res.headers, upstream_res.raw
http_headers_buff = recorder.get_headers_buff()
warc_headers = {}
@ -219,7 +247,7 @@ class LiveWebLoader(BaseLoader):
warc_headers['WARC-Type'] = 'response'
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Target-URI'] = cdx['url']
warc_headers['WARC-Date'] = self._make_date(dt)
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
if recorder.target_ip:
warc_headers['WARC-IP-Address'] = recorder.target_ip
@ -232,10 +260,6 @@ class LiveWebLoader(BaseLoader):
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
return (warc_headers, http_headers_buff, upstream_res.raw)
@staticmethod
def _make_date(dt):
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
@staticmethod
def _make_warc_id(id_=None):
if not id_: