diff --git a/setup.py b/setup.py index fee3441c..cada7efc 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ class PyTest(TestCommand): import pytest import sys import os - cmdline = ' --cov webagg -v test/' + cmdline = ' --cov webagg -vv test/' errcode = pytest.main(cmdline) sys.exit(errcode) diff --git a/test/test_dir_agg.py b/test/test_dir_agg.py index 2500b9cf..9d2db560 100644 --- a/test/test_dir_agg.py +++ b/test/test_dir_agg.py @@ -5,6 +5,8 @@ import json from .testutils import to_path +from mock import patch + from webagg.aggregator import DirectoryIndexSource, SimpleAggregator from webagg.indexsource import MementoIndexSource @@ -14,6 +16,10 @@ root_dir = None orig_cwd = None dir_loader = None +linkheader = """\ +; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", ; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", ; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", ; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", ; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\ +""" + def setup_module(): global root_dir root_dir = tempfile.mkdtemp() @@ -124,7 +130,10 @@ def test_agg_all_found_2(): assert(errs == {}) +def mock_link_header(*args, **kwargs): + return linkheader +@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header) def test_agg_dir_and_memento(): sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'local': dir_loader} @@ -133,9 +142,9 @@ def test_agg_dir_and_memento(): res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6}) exp = [ - {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'}, - {'source': 'ia', 'timestamp': '20100506013442', 'load_url': 'http://web.archive.org/web/20100506013442id_/http://example.com/'}, + {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'}, + {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'}, {'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, {'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, {'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} diff --git a/test/test_memento_agg.py b/test/test_memento_agg.py index 88f36daf..934a9474 100644 --- a/test/test_memento_agg.py +++ b/test/test_memento_agg.py @@ -73,10 +73,11 @@ def test_mem_agg_index_2(agg): exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"}, {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"}, - {"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"}, + #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"}, {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"}, + {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}, {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"}, - {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}] + {"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}] assert(json_list(res) == exp) assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"}) diff --git a/webagg/handlers.py b/webagg/handlers.py index 55529156..d9c06f96 100644 --- a/webagg/handlers.py +++ b/webagg/handlers.py @@ -118,7 +118,6 @@ class ResourceHandler(IndexHandler): class DefaultResourceHandler(ResourceHandler): def __init__(self, index_source, warc_paths=''): loaders = [WARCPathLoader(warc_paths, index_source), - # UpstreamProxyLoader(), LiveWebLoader(), ] super(DefaultResourceHandler, self).__init__(index_source, loaders) diff --git a/webagg/indexsource.py b/webagg/indexsource.py index 6989b894..c83d3006 100644 --- a/webagg/indexsource.py +++ b/webagg/indexsource.py @@ -2,12 +2,11 @@ import redis from pywb.utils.binsearch import iter_range from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp -from pywb.utils.timeutils import timestamp_to_sec, timestamp_now -from pywb.utils.canonicalize import canonicalize, calc_search_range +from pywb.utils.timeutils import timestamp_now +from pywb.utils.canonicalize import canonicalize from pywb.utils.wbexception import NotFoundException from pywb.cdx.cdxobject import CDXObject -from pywb.cdx.query import CDXQuery from webagg.liverec import patched_requests as requests @@ -80,27 +79,17 @@ class RemoteIndexSource(BaseIndexSource): return 'remote' -#============================================================================= -class UpstreamAggIndexSource(RemoteIndexSource): - def __init__(self, base_url): - api_url = base_url + '/index?url={url}' - proxy_url = base_url + '/resource?url={url}&closest={timestamp}' - super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename') - - def _set_load_url(self, cdx): - super(UpstreamAggIndexSource, self)._set_load_url(cdx) - cdx['offset'] = '0' - cdx.pop('load_url', '') - - #============================================================================= class LiveIndexSource(BaseIndexSource): + def __init__(self, proxy_url='{url}'): + self.proxy_url = proxy_url + def load_index(self, params): cdx = CDXObject() cdx['urlkey'] = params.get('key').decode('utf-8') cdx['timestamp'] = timestamp_now() cdx['url'] = params['url'] - cdx['load_url'] = params['url'] + cdx['load_url'] = res_template(self.proxy_url, params) cdx['is_live'] = 'true' def live(): yield cdx diff --git a/webagg/proxyindexsource.py b/webagg/proxyindexsource.py new file mode 100644 index 00000000..435c9240 --- /dev/null +++ b/webagg/proxyindexsource.py @@ -0,0 +1,54 @@ +from pywb.cdx.cdxobject import CDXObject +from pywb.utils.wbexception import NotFoundException +from webagg.indexsource import BaseIndexSource, RemoteIndexSource +from webagg.responseloader import LiveWebLoader +from webagg.utils import ParamFormatter, res_template +from pywb.utils.timeutils import timestamp_now + + +#============================================================================= +class UpstreamAggIndexSource(RemoteIndexSource): + def __init__(self, base_url): + api_url = base_url + '/index?url={url}' + proxy_url = base_url + '/resource?url={url}&closest={timestamp}' + super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename') + + def _set_load_url(self, cdx): + super(UpstreamAggIndexSource, self)._set_load_url(cdx) + cdx['offset'] = '0' + cdx.pop('load_url', '') + + +#============================================================================= +class ProxyMementoIndexSource(BaseIndexSource): + def __init__(self, proxy_url='{url}'): + self.proxy_url = proxy_url + self.loader = LiveWebLoader() + + def load_index(self, params): + cdx = CDXObject() + cdx['urlkey'] = params.get('key').decode('utf-8') + + closest = params.get('closest') + cdx['timestamp'] = closest if closest else timestamp_now() + cdx['url'] = params['url'] + cdx['load_url'] = res_template(self.proxy_url, params) + cdx['memento_url'] = cdx['load_url'] + return self._do_load(cdx, params) + + def _do_load(self, cdx, params): + result = self.loader.load_resource(cdx, params) + if not result: + raise NotFoundException('Not a memento: ' + cdx['url']) + + cdx['_cached_result'] = result + yield cdx + + def __str__(self): + return 'proxy' + + @staticmethod + def upstream_resource(base_url): + return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}') + + diff --git a/webagg/responseloader.py b/webagg/responseloader.py index d29b629d..82d98e41 100644 --- a/webagg/responseloader.py +++ b/webagg/responseloader.py @@ -3,10 +3,10 @@ from webagg.liverec import request as remote_request from webagg.utils import MementoUtils -from requests import session +from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp +from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date +from pywb.utils.timeutils import http_date_to_datetime, datetime_to_http_date -from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date -from pywb.utils.timeutils import iso_date_to_datetime from pywb.utils.wbexception import LiveResourceException from pywb.utils.statusandheaders import StatusAndHeaders @@ -62,21 +62,32 @@ class StreamIter(six.Iterator): #============================================================================= class BaseLoader(object): def __call__(self, cdx, params): - entry = self._load_resource(cdx, params) + entry = self.load_resource(cdx, params) if not entry: return None, None - warc_headers, other_headers_buff, stream = entry + warc_headers, other_headers, stream = entry out_headers = {} + out_headers['WebAgg-Type'] = 'warc' out_headers['Source-Coll'] = cdx.get('source', '') + out_headers['Content-Type'] = 'application/warc-record' + + if not warc_headers: + if other_headers: + out_headers['Link'] = other_headers.get('Link') + out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime') + out_headers['Content-Length'] = other_headers.get('Content-Length') + + #for n, v in other_headers.items(): + # out_headers[n] = v + + return out_headers, StreamIter(stream) out_headers['Link'] = MementoUtils.make_link( warc_headers.get_header('WARC-Target-URI'), 'original') - out_headers['Content-Type'] = 'application/warc-record' - memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date')) out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt) @@ -88,7 +99,7 @@ class BaseLoader(object): return out_headers, StreamIter(stream, header1=warc_headers_buff, - header2=other_headers_buff) + header2=other_headers) def _set_content_len(self, content_len_str, headers, existing_len): # Try to set content-length, if it is available and valid @@ -134,7 +145,10 @@ class WARCPathLoader(BaseLoader): yield check - def _load_resource(self, cdx, params): + def load_resource(self, cdx, params): + if cdx.get('_cached_result'): + return cdx.get('_cached_result') + if not cdx.get('filename') or cdx.get('offset') is None: return None @@ -174,7 +188,7 @@ class LiveWebLoader(BaseLoader): b'content-location', b'x-archive') - def _load_resource(self, cdx, params): + def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: return None @@ -187,7 +201,7 @@ class LiveWebLoader(BaseLoader): dt = timestamp_to_datetime(cdx['timestamp']) - if not cdx.get('is_live'): + if cdx.get('memento_url'): req_headers['Accept-Datetime'] = datetime_to_http_date(dt) # if different url, ensure origin is not set @@ -212,6 +226,20 @@ class LiveWebLoader(BaseLoader): except Exception as e: raise LiveResourceException(load_url) + memento_dt = upstream_res.headers.get('Memento-Datetime') + if memento_dt: + dt = http_date_to_datetime(memento_dt) + cdx['timestamp'] = datetime_to_timestamp(dt) + elif cdx.get('memento_url'): + # if 'memento_url' set and no Memento-Datetime header present + # then its an error + return None + + agg_type = upstream_res.headers.get('WebAgg-Type') + if agg_type == 'warc': + cdx['source'] = upstream_res.headers.get('Source-Coll') + return None, upstream_res.headers, upstream_res.raw + http_headers_buff = recorder.get_headers_buff() warc_headers = {} @@ -219,7 +247,7 @@ class LiveWebLoader(BaseLoader): warc_headers['WARC-Type'] = 'response' warc_headers['WARC-Record-ID'] = self._make_warc_id() warc_headers['WARC-Target-URI'] = cdx['url'] - warc_headers['WARC-Date'] = self._make_date(dt) + warc_headers['WARC-Date'] = datetime_to_iso_date(dt) if recorder.target_ip: warc_headers['WARC-IP-Address'] = recorder.target_ip @@ -232,10 +260,6 @@ class LiveWebLoader(BaseLoader): warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) return (warc_headers, http_headers_buff, upstream_res.raw) - @staticmethod - def _make_date(dt): - return dt.strftime('%Y-%m-%dT%H:%M:%SZ') - @staticmethod def _make_warc_id(id_=None): if not id_: