add ProxyLiveIndexSource (class ProxyMementoIndexSource in webagg/proxyindexsource.py) for proxying the upstream connection directly, without a second index query

liveloader: if the 'memento_url' key is set, then the Memento-Datetime header must be present, or it's an error response.
liveindexsource: add an option to specify a custom live path (e.g. a prefix for caching).
Fix test cases that changed due to ia (todo: mock up all external data!).
2025-03-15 08:04:49 +01:00 · 2016-03-08 10:27:13 -08:00 · 2016-03-08 10:27:13 -08:00 · 107ba9aabc
commit 107ba9aabc
parent c1895ae70f
7 changed files with 115 additions and 39 deletions
--- a/setup.py
+++ b/setup.py
@ -14,7 +14,7 @@ class PyTest(TestCommand):
        import pytest
        import sys
        import os
-        cmdline = ' --cov webagg -v test/'
+        cmdline = ' --cov webagg -vv test/'
        errcode = pytest.main(cmdline)
        sys.exit(errcode)

--- a/test/test_dir_agg.py
+++ b/test/test_dir_agg.py
@ -5,6 +5,8 @@ import json

 from .testutils import to_path

+from mock import patch
+
 from webagg.aggregator import DirectoryIndexSource, SimpleAggregator
 from webagg.indexsource import MementoIndexSource

@ -14,6 +16,10 @@ root_dir = None
 orig_cwd = None
 dir_loader = None

+linkheader = """\
+<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
+"""
+
 def setup_module():
    global root_dir
    root_dir = tempfile.mkdtemp()
@ -124,7 +130,10 @@ def test_agg_all_found_2():
    assert(errs == {})


+def mock_link_header(*args, **kwargs):
+    return linkheader

+@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
 def test_agg_dir_and_memento():
    sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
               'local': dir_loader}
@ -133,9 +142,9 @@ def test_agg_dir_and_memento():
    res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})

    exp = [
-        {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
        {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
-        {'source': 'ia', 'timestamp': '20100506013442', 'load_url': 'http://web.archive.org/web/20100506013442id_/http://example.com/'},
+        {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
+        {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
        {'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
        {'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
        {'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
--- a/test/test_memento_agg.py
+++ b/test/test_memento_agg.py
@ -73,10 +73,11 @@ def test_mem_agg_index_2(agg):

    exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
            {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
-            {"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
+            #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
            {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
+            {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
            {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
-            {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
+            {"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}]

    assert(json_list(res) == exp)
    assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
--- a/webagg/handlers.py
+++ b/webagg/handlers.py
@ -118,7 +118,6 @@ class ResourceHandler(IndexHandler):
 class DefaultResourceHandler(ResourceHandler):
    def __init__(self, index_source, warc_paths=''):
        loaders = [WARCPathLoader(warc_paths, index_source),
-               #    UpstreamProxyLoader(),
                   LiveWebLoader(),
                  ]
        super(DefaultResourceHandler, self).__init__(index_source, loaders)
--- a/webagg/indexsource.py
+++ b/webagg/indexsource.py
@ -2,12 +2,11 @@ import redis

 from pywb.utils.binsearch import iter_range
 from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
-from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
-from pywb.utils.canonicalize import canonicalize, calc_search_range
+from pywb.utils.timeutils import timestamp_now
+from pywb.utils.canonicalize import canonicalize
 from pywb.utils.wbexception import NotFoundException

 from pywb.cdx.cdxobject import CDXObject
-from pywb.cdx.query import CDXQuery

 from webagg.liverec import patched_requests as requests

@ -80,27 +79,17 @@ class RemoteIndexSource(BaseIndexSource):
        return 'remote'


-#=============================================================================
-class UpstreamAggIndexSource(RemoteIndexSource):
-    def __init__(self, base_url):
-        api_url = base_url + '/index?url={url}'
-        proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
-        super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
-
-    def _set_load_url(self, cdx):
-        super(UpstreamAggIndexSource, self)._set_load_url(cdx)
-        cdx['offset'] = '0'
-        cdx.pop('load_url', '')
-
-
 #=============================================================================
 class LiveIndexSource(BaseIndexSource):
+    def __init__(self, proxy_url='{url}'):
+        self.proxy_url = proxy_url
+
    def load_index(self, params):
        cdx = CDXObject()
        cdx['urlkey'] = params.get('key').decode('utf-8')
        cdx['timestamp'] = timestamp_now()
        cdx['url'] = params['url']
-        cdx['load_url'] = params['url']
+        cdx['load_url'] = res_template(self.proxy_url, params)
        cdx['is_live'] = 'true'
        def live():
            yield cdx
--- a/webagg/proxyindexsource.py
+++ b/webagg/proxyindexsource.py
@ -0,0 +1,54 @@
+from pywb.cdx.cdxobject import CDXObject
+from pywb.utils.wbexception import NotFoundException
+from webagg.indexsource import BaseIndexSource, RemoteIndexSource
+from webagg.responseloader import LiveWebLoader
+from webagg.utils import ParamFormatter, res_template
+from pywb.utils.timeutils import timestamp_now
+
+
+#=============================================================================
+class UpstreamAggIndexSource(RemoteIndexSource):
+    def __init__(self, base_url):
+        api_url = base_url + '/index?url={url}'
+        proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
+        super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
+
+    def _set_load_url(self, cdx):
+        super(UpstreamAggIndexSource, self)._set_load_url(cdx)
+        cdx['offset'] = '0'
+        cdx.pop('load_url', '')
+
+
+#=============================================================================
+class ProxyMementoIndexSource(BaseIndexSource):
+    def __init__(self, proxy_url='{url}'):
+        self.proxy_url = proxy_url
+        self.loader = LiveWebLoader()
+
+    def load_index(self, params):
+        cdx = CDXObject()
+        cdx['urlkey'] = params.get('key').decode('utf-8')
+
+        closest = params.get('closest')
+        cdx['timestamp'] = closest if closest else timestamp_now()
+        cdx['url'] = params['url']
+        cdx['load_url'] = res_template(self.proxy_url, params)
+        cdx['memento_url'] = cdx['load_url']
+        return self._do_load(cdx, params)
+
+    def _do_load(self, cdx, params):
+        result = self.loader.load_resource(cdx, params)
+        if not result:
+            raise NotFoundException('Not a memento: ' + cdx['url'])
+
+        cdx['_cached_result'] = result
+        yield cdx
+
+    def __str__(self):
+        return 'proxy'
+
+    @staticmethod
+    def upstream_resource(base_url):
+        return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}')
+
+
--- a/webagg/responseloader.py
+++ b/webagg/responseloader.py
@ -3,10 +3,10 @@ from webagg.liverec import request as remote_request

 from webagg.utils import MementoUtils

-from requests import session
+from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
+from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
+from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date

-from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
-from pywb.utils.timeutils import iso_date_to_datetime
 from pywb.utils.wbexception import LiveResourceException
 from pywb.utils.statusandheaders import StatusAndHeaders

@ -62,21 +62,32 @@ class StreamIter(six.Iterator):
 #=============================================================================
 class BaseLoader(object):
    def __call__(self, cdx, params):
-        entry = self._load_resource(cdx, params)
+        entry = self.load_resource(cdx, params)
        if not entry:
            return None, None

-        warc_headers, other_headers_buff, stream = entry
+        warc_headers, other_headers, stream = entry

        out_headers = {}
+        out_headers['WebAgg-Type'] = 'warc'
        out_headers['Source-Coll'] = cdx.get('source', '')
+        out_headers['Content-Type'] = 'application/warc-record'
+
+        if not warc_headers:
+            if other_headers:
+                out_headers['Link'] = other_headers.get('Link')
+                out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
+                out_headers['Content-Length'] = other_headers.get('Content-Length')
+
+                #for n, v in other_headers.items():
+                #    out_headers[n] = v
+
+            return out_headers, StreamIter(stream)

        out_headers['Link'] = MementoUtils.make_link(
                                warc_headers.get_header('WARC-Target-URI'),
                                'original')

-        out_headers['Content-Type'] = 'application/warc-record'
-
        memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
        out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

@ -88,7 +99,7 @@ class BaseLoader(object):

        return out_headers, StreamIter(stream,
                                       header1=warc_headers_buff,
-                                       header2=other_headers_buff)
+                                       header2=other_headers)

    def _set_content_len(self, content_len_str, headers, existing_len):
        # Try to set content-length, if it is available and valid
@ -134,7 +145,10 @@ class WARCPathLoader(BaseLoader):

            yield check

-    def _load_resource(self, cdx, params):
+    def load_resource(self, cdx, params):
+        if cdx.get('_cached_result'):
+            return cdx.get('_cached_result')
+
        if not cdx.get('filename') or cdx.get('offset') is None:
            return None

@ -174,7 +188,7 @@ class LiveWebLoader(BaseLoader):
                    b'content-location',
                    b'x-archive')

-    def _load_resource(self, cdx, params):
+    def load_resource(self, cdx, params):
        load_url = cdx.get('load_url')
        if not load_url:
            return None
@ -187,7 +201,7 @@ class LiveWebLoader(BaseLoader):

        dt = timestamp_to_datetime(cdx['timestamp'])

-        if not cdx.get('is_live'):
+        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        # if different url, ensure origin is not set
@ -212,6 +226,20 @@ class LiveWebLoader(BaseLoader):
        except Exception as e:
            raise LiveResourceException(load_url)

+        memento_dt = upstream_res.headers.get('Memento-Datetime')
+        if memento_dt:
+            dt = http_date_to_datetime(memento_dt)
+            cdx['timestamp'] = datetime_to_timestamp(dt)
+        elif cdx.get('memento_url'):
+        # if 'memento_url' set and no Memento-Datetime header present
+        # then its an error
+            return None
+
+        agg_type = upstream_res.headers.get('WebAgg-Type')
+        if agg_type == 'warc':
+            cdx['source'] = upstream_res.headers.get('Source-Coll')
+            return None, upstream_res.headers, upstream_res.raw
+
        http_headers_buff = recorder.get_headers_buff()

        warc_headers = {}
@ -219,7 +247,7 @@ class LiveWebLoader(BaseLoader):
        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
-        warc_headers['WARC-Date'] = self._make_date(dt)
+        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
        if recorder.target_ip:
            warc_headers['WARC-IP-Address'] = recorder.target_ip

@ -232,10 +260,6 @@ class LiveWebLoader(BaseLoader):
        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res.raw)

-    @staticmethod
-    def _make_date(dt):
-        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
-
    @staticmethod
    def _make_warc_id(id_=None):
        if not id_: