diff --git a/setup.py b/setup.py
index fee3441c..cada7efc 100755
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ class PyTest(TestCommand):
import pytest
import sys
import os
- cmdline = ' --cov webagg -v test/'
+ cmdline = ' --cov webagg -vv test/'
errcode = pytest.main(cmdline)
sys.exit(errcode)
diff --git a/test/test_dir_agg.py b/test/test_dir_agg.py
index 2500b9cf..9d2db560 100644
--- a/test/test_dir_agg.py
+++ b/test/test_dir_agg.py
@@ -5,6 +5,8 @@ import json
from .testutils import to_path
+from mock import patch
+
from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
from webagg.indexsource import MementoIndexSource
@@ -14,6 +16,10 @@ root_dir = None
orig_cwd = None
dir_loader = None
+linkheader = """\
+; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", ; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", ; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", ; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", ; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
+"""
+
def setup_module():
global root_dir
root_dir = tempfile.mkdtemp()
@@ -124,7 +130,10 @@ def test_agg_all_found_2():
assert(errs == {})
+def mock_link_header(*args, **kwargs):  # test stand-in patched over MementoIndexSource.get_timegate_links; ignores all arguments
+ return linkheader  # canned Link header string defined at module level
+@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
def test_agg_dir_and_memento():
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'local': dir_loader}
@@ -133,9 +142,9 @@ def test_agg_dir_and_memento():
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
exp = [
- {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
- {'source': 'ia', 'timestamp': '20100506013442', 'load_url': 'http://web.archive.org/web/20100506013442id_/http://example.com/'},
+ {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
+ {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
diff --git a/test/test_memento_agg.py b/test/test_memento_agg.py
index 88f36daf..934a9474 100644
--- a/test/test_memento_agg.py
+++ b/test/test_memento_agg.py
@@ -73,10 +73,11 @@ def test_mem_agg_index_2(agg):
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
- {"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
+ #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
{"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
+ {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
- {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
+ {"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}]
assert(json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
diff --git a/webagg/handlers.py b/webagg/handlers.py
index 55529156..d9c06f96 100644
--- a/webagg/handlers.py
+++ b/webagg/handlers.py
@@ -118,7 +118,6 @@ class ResourceHandler(IndexHandler):
class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths=''):
loaders = [WARCPathLoader(warc_paths, index_source),
- # UpstreamProxyLoader(),
LiveWebLoader(),
]
super(DefaultResourceHandler, self).__init__(index_source, loaders)
diff --git a/webagg/indexsource.py b/webagg/indexsource.py
index 6989b894..c83d3006 100644
--- a/webagg/indexsource.py
+++ b/webagg/indexsource.py
@@ -2,12 +2,11 @@ import redis
from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
-from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
-from pywb.utils.canonicalize import canonicalize, calc_search_range
+from pywb.utils.timeutils import timestamp_now
+from pywb.utils.canonicalize import canonicalize
from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxobject import CDXObject
-from pywb.cdx.query import CDXQuery
from webagg.liverec import patched_requests as requests
@@ -80,27 +79,17 @@ class RemoteIndexSource(BaseIndexSource):
return 'remote'
-#=============================================================================
-class UpstreamAggIndexSource(RemoteIndexSource):
- def __init__(self, base_url):
- api_url = base_url + '/index?url={url}'
- proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
- super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
-
- def _set_load_url(self, cdx):
- super(UpstreamAggIndexSource, self)._set_load_url(cdx)
- cdx['offset'] = '0'
- cdx.pop('load_url', '')
-
-
#=============================================================================
class LiveIndexSource(BaseIndexSource):
+ def __init__(self, proxy_url='{url}'):
+ self.proxy_url = proxy_url
+
def load_index(self, params):
cdx = CDXObject()
cdx['urlkey'] = params.get('key').decode('utf-8')
cdx['timestamp'] = timestamp_now()
cdx['url'] = params['url']
- cdx['load_url'] = params['url']
+ cdx['load_url'] = res_template(self.proxy_url, params)
cdx['is_live'] = 'true'
def live():
yield cdx
diff --git a/webagg/proxyindexsource.py b/webagg/proxyindexsource.py
new file mode 100644
index 00000000..435c9240
--- /dev/null
+++ b/webagg/proxyindexsource.py
@@ -0,0 +1,54 @@
+from pywb.cdx.cdxobject import CDXObject
+from pywb.utils.wbexception import NotFoundException
+from webagg.indexsource import BaseIndexSource, RemoteIndexSource
+from webagg.responseloader import LiveWebLoader
+from webagg.utils import ParamFormatter, res_template
+from pywb.utils.timeutils import timestamp_now
+
+
+#=============================================================================
+class UpstreamAggIndexSource(RemoteIndexSource):  # remote index source whose endpoints are derived from a single base_url
+ def __init__(self, base_url):
+ api_url = base_url + '/index?url={url}'  # index query template under base_url
+ proxy_url = base_url + '/resource?url={url}&closest={timestamp}'  # resource fetch template under base_url
+ super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')  # NOTE(review): meaning of the 'filename' arg is defined by RemoteIndexSource.__init__ (not shown here) -- confirm
+
+ def _set_load_url(self, cdx):
+ super(UpstreamAggIndexSource, self)._set_load_url(cdx)  # let the parent populate the cdx first
+ cdx['offset'] = '0'  # fixed offset, stored as a string
+ cdx.pop('load_url', '')  # drop 'load_url' if present; the '' default avoids a KeyError
+
+
+#=============================================================================
+class ProxyMementoIndexSource(BaseIndexSource):  # index source that fetches the resource itself (via LiveWebLoader) to produce its single cdx entry
+ def __init__(self, proxy_url='{url}'):  # proxy_url: template expanded with the request params in load_index
+ self.proxy_url = proxy_url
+ self.loader = LiveWebLoader()
+
+ def load_index(self, params):
+ cdx = CDXObject()
+ cdx['urlkey'] = params.get('key').decode('utf-8')  # assumes params['key'] is present and is bytes -- TODO confirm
+
+ closest = params.get('closest')
+ cdx['timestamp'] = closest if closest else timestamp_now()  # fall back to timestamp_now() when 'closest' is empty or missing
+ cdx['url'] = params['url']
+ cdx['load_url'] = res_template(self.proxy_url, params)
+ cdx['memento_url'] = cdx['load_url']  # LiveWebLoader.load_resource checks 'memento_url' before sending Accept-Datetime
+ return self._do_load(cdx, params)  # _do_load is a generator: nothing is fetched or raised until it is iterated
+
+ def _do_load(self, cdx, params):
+ result = self.loader.load_resource(cdx, params)
+ if not result:
+ raise NotFoundException('Not a memento: ' + cdx['url'])  # loader returned a falsy result
+
+ cdx['_cached_result'] = result  # WARCPathLoader.load_resource returns this cached value instead of loading again
+ yield cdx  # exactly one cdx entry is produced
+
+ def __str__(self):
+ return 'proxy'
+
+ @staticmethod
+ def upstream_resource(base_url):  # alternate constructor targeting base_url's /resource endpoint
+ return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}')
+
+
diff --git a/webagg/responseloader.py b/webagg/responseloader.py
index d29b629d..82d98e41 100644
--- a/webagg/responseloader.py
+++ b/webagg/responseloader.py
@@ -3,10 +3,10 @@ from webagg.liverec import request as remote_request
from webagg.utils import MementoUtils
-from requests import session
+from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
+from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
+from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date
-from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
-from pywb.utils.timeutils import iso_date_to_datetime
from pywb.utils.wbexception import LiveResourceException
from pywb.utils.statusandheaders import StatusAndHeaders
@@ -62,21 +62,32 @@ class StreamIter(six.Iterator):
#=============================================================================
class BaseLoader(object):
def __call__(self, cdx, params):
- entry = self._load_resource(cdx, params)
+ entry = self.load_resource(cdx, params)
if not entry:
return None, None
- warc_headers, other_headers_buff, stream = entry
+ warc_headers, other_headers, stream = entry
out_headers = {}
+ out_headers['WebAgg-Type'] = 'warc'
out_headers['Source-Coll'] = cdx.get('source', '')
+ out_headers['Content-Type'] = 'application/warc-record'
+
+ if not warc_headers:
+ if other_headers:
+ out_headers['Link'] = other_headers.get('Link')
+ out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
+ out_headers['Content-Length'] = other_headers.get('Content-Length')
+
+ #for n, v in other_headers.items():
+ # out_headers[n] = v
+
+ return out_headers, StreamIter(stream)
out_headers['Link'] = MementoUtils.make_link(
warc_headers.get_header('WARC-Target-URI'),
'original')
- out_headers['Content-Type'] = 'application/warc-record'
-
memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
@@ -88,7 +99,7 @@ class BaseLoader(object):
return out_headers, StreamIter(stream,
header1=warc_headers_buff,
- header2=other_headers_buff)
+ header2=other_headers)
def _set_content_len(self, content_len_str, headers, existing_len):
# Try to set content-length, if it is available and valid
@@ -134,7 +145,10 @@ class WARCPathLoader(BaseLoader):
yield check
- def _load_resource(self, cdx, params):
+ def load_resource(self, cdx, params):
+ if cdx.get('_cached_result'):
+ return cdx.get('_cached_result')
+
if not cdx.get('filename') or cdx.get('offset') is None:
return None
@@ -174,7 +188,7 @@ class LiveWebLoader(BaseLoader):
b'content-location',
b'x-archive')
- def _load_resource(self, cdx, params):
+ def load_resource(self, cdx, params):
load_url = cdx.get('load_url')
if not load_url:
return None
@@ -187,7 +201,7 @@ class LiveWebLoader(BaseLoader):
dt = timestamp_to_datetime(cdx['timestamp'])
- if not cdx.get('is_live'):
+ if cdx.get('memento_url'):
req_headers['Accept-Datetime'] = datetime_to_http_date(dt)
# if different url, ensure origin is not set
@@ -212,6 +226,20 @@ class LiveWebLoader(BaseLoader):
except Exception as e:
raise LiveResourceException(load_url)
+ memento_dt = upstream_res.headers.get('Memento-Datetime')
+ if memento_dt:
+ dt = http_date_to_datetime(memento_dt)
+ cdx['timestamp'] = datetime_to_timestamp(dt)
+ elif cdx.get('memento_url'):
+ # if 'memento_url' is set and no Memento-Datetime header is present,
+ # then it's an error
+ return None
+
+ agg_type = upstream_res.headers.get('WebAgg-Type')
+ if agg_type == 'warc':
+ cdx['source'] = upstream_res.headers.get('Source-Coll')
+ return None, upstream_res.headers, upstream_res.raw
+
http_headers_buff = recorder.get_headers_buff()
warc_headers = {}
@@ -219,7 +247,7 @@ class LiveWebLoader(BaseLoader):
warc_headers['WARC-Type'] = 'response'
warc_headers['WARC-Record-ID'] = self._make_warc_id()
warc_headers['WARC-Target-URI'] = cdx['url']
- warc_headers['WARC-Date'] = self._make_date(dt)
+ warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
if recorder.target_ip:
warc_headers['WARC-IP-Address'] = recorder.target_ip
@@ -232,10 +260,6 @@ class LiveWebLoader(BaseLoader):
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
return (warc_headers, http_headers_buff, upstream_res.raw)
- @staticmethod
- def _make_date(dt):
- return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
-
@staticmethod
def _make_warc_id(id_=None):
if not id_: