diff --git a/pywb/__init__.py b/pywb/__init__.py index c6233bbf..061a9bcc 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.33.0' +__version__ = '0.50.0' DEFAULT_CONFIG = 'pywb/default_config.yaml' diff --git a/pywb/webagg/aggregator.py b/pywb/webagg/aggregator.py index 9ca59b52..78c14b71 100644 --- a/pywb/webagg/aggregator.py +++ b/pywb/webagg/aggregator.py @@ -15,10 +15,10 @@ from heapq import merge from collections import deque from itertools import chain -from webagg.indexsource import FileIndexSource, RedisIndexSource +from pywb.webagg.indexsource import FileIndexSource, RedisIndexSource from pywb.utils.wbexception import NotFoundException, WbException -from webagg.utils import ParamFormatter, res_template +from pywb.webagg.utils import ParamFormatter, res_template import six import glob diff --git a/pywb/webagg/app.py b/pywb/webagg/app.py index e045480b..b221c85d 100644 --- a/pywb/webagg/app.py +++ b/pywb/webagg/app.py @@ -1,4 +1,4 @@ -from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest +from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest from werkzeug.routing import Map, Rule import requests diff --git a/pywb/webagg/handlers.py b/pywb/webagg/handlers.py index a8e067f3..b8d2bbfa 100644 --- a/pywb/webagg/handlers.py +++ b/pywb/webagg/handlers.py @@ -1,5 +1,5 @@ -from webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader -from webagg.utils import MementoUtils +from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader +from pywb.webagg.utils import MementoUtils from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import NotFoundException diff --git a/pywb/webagg/indexsource.py b/pywb/webagg/indexsource.py index a52bb11a..76adc2ab 100644 --- a/pywb/webagg/indexsource.py +++ b/pywb/webagg/indexsource.py @@ -8,11 +8,10 @@ from pywb.utils.wbexception import NotFoundException from pywb.cdx.cdxobject 
import CDXObject -#from webagg.liverec import patched_requests as requests import requests -from webagg.utils import ParamFormatter, res_template -from webagg.utils import MementoUtils +from pywb.webagg.utils import ParamFormatter, res_template +from pywb.webagg.utils import MementoUtils WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}' diff --git a/pywb/webagg/proxyindexsource.py b/pywb/webagg/proxyindexsource.py index 435c9240..741f116b 100644 --- a/pywb/webagg/proxyindexsource.py +++ b/pywb/webagg/proxyindexsource.py @@ -1,8 +1,8 @@ from pywb.cdx.cdxobject import CDXObject from pywb.utils.wbexception import NotFoundException -from webagg.indexsource import BaseIndexSource, RemoteIndexSource -from webagg.responseloader import LiveWebLoader -from webagg.utils import ParamFormatter, res_template +from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource +from pywb.webagg.responseloader import LiveWebLoader +from pywb.webagg.utils import ParamFormatter, res_template from pywb.utils.timeutils import timestamp_now diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index ecda0723..ecebe82a 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -1,6 +1,6 @@ -from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter -from webagg.utils import ParamFormatter -from webagg.indexsource import RedisIndexSource +from pywb.webagg.utils import MementoUtils, StreamIter, chunk_encode_iter +from pywb.webagg.utils import ParamFormatter +from pywb.webagg.indexsource import RedisIndexSource from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date diff --git a/pywb/webagg/test/live.py b/pywb/webagg/test/live.py index 2e4f84a9..cec4564c 100644 --- a/pywb/webagg/test/live.py +++ b/pywb/webagg/test/live.py @@ -1,10 +1,10 @@ from gevent.monkey import patch_all; patch_all() -from webagg.test.testutils import LiveServerTests 
-from webagg.handlers import DefaultResourceHandler -from webagg.app import ResAggApp -from webagg.indexsource import LiveIndexSource, RedisIndexSource -from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource +from pywb.webagg.test.testutils import LiveServerTests +from pywb.webagg.handlers import DefaultResourceHandler +from pywb.webagg.app import ResAggApp +from pywb.webagg.indexsource import LiveIndexSource, RedisIndexSource +from pywb.webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource def simpleapp(): app = ResAggApp(debug=True) diff --git a/pywb/webagg/test/test_dir_agg.py b/pywb/webagg/test/test_dir_agg.py index bce07046..0b1c521c 100644 --- a/pywb/webagg/test/test_dir_agg.py +++ b/pywb/webagg/test/test_dir_agg.py @@ -3,15 +3,15 @@ import os import shutil import json -from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass +from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass, TEST_CDX_PATH from mock import patch import time -from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource -from webagg.aggregator import SimpleAggregator -from webagg.indexsource import MementoIndexSource +from pywb.webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource +from pywb.webagg.aggregator import SimpleAggregator +from pywb.webagg.indexsource import MementoIndexSource #============================================================================= @@ -39,9 +39,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): dir_prefix = to_path(cls.root_dir) dir_path ='colls/{coll}/indexes' - shutil.copy(to_path('testdata/example.cdxj'), coll_A) - shutil.copy(to_path('testdata/iana.cdxj'), coll_B) - shutil.copy(to_path('testdata/dupes.cdxj'), coll_C) + shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A) + shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B) + shutil.copy(to_path(TEST_CDX_PATH + 'dupes.cdxj'), coll_C) with open(to_path(cls.root_dir) + '/somefile', 
'w') as fh: fh.write('foo') @@ -57,7 +57,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_collA_found(self): res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'}) - exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}] + exp = [{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -108,13 +108,13 @@ class TestDirAgg(TempDirTests, BaseTestClass): exp = [ {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + {'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} ] assert(to_json_list(res) == exp) assert(errs == {}) - @patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header) + @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header) def test_agg_dir_and_memento(self): sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'local': self.dir_loader} @@ -128,7 +128,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'}, {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + {'source': 'local:colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} 
] assert(to_json_list(res) == exp) @@ -156,7 +156,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_dir_sources_1(self): res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) - exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file', 'colls/B/indexes/iana.cdxj': 'file', 'colls/C/indexes/dupes.cdxj': 'file'} } @@ -166,7 +166,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_dir_sources_2(self): res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'}) - exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file', 'colls/C/indexes/dupes.cdxj': 'file'} } @@ -177,7 +177,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '') res = loader.get_source_list({'url': 'example.com/'}) - exp = {'sources': {'example.cdxj': 'file'}} + exp = {'sources': {'example2.cdxj': 'file'}} assert(res == exp) @@ -193,7 +193,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_cache_dir_sources_1(self): - exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file', 'colls/B/indexes/iana.cdxj': 'file', 'colls/C/indexes/dupes.cdxj': 'file'} } diff --git a/pywb/webagg/test/test_handlers.py b/pywb/webagg/test/test_handlers.py index 6fb5c8d8..5eed24ac 100644 --- a/pywb/webagg/test/test_handlers.py +++ b/pywb/webagg/test/test_handlers.py @@ -2,14 +2,14 @@ from collections import OrderedDict -from webagg.handlers import DefaultResourceHandler, HandlerSeq +from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq -from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource -from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator -from webagg.aggregator import DirectoryIndexSource +from 
pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource +from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator +from pywb.webagg.aggregator import DirectoryIndexSource -from webagg.app import ResAggApp -from webagg.utils import MementoUtils +from pywb.webagg.app import ResAggApp +from pywb.webagg.utils import MementoUtils from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.bufferedreaders import ChunkedDataReader @@ -19,12 +19,12 @@ from six.moves.urllib.parse import urlencode import webtest from fakeredis import FakeStrictRedis -from .testutils import to_path, FakeRedisTests, BaseTestClass +from .testutils import to_path, FakeRedisTests, BaseTestClass, TEST_CDX_PATH, TEST_WARC_PATH import json sources = { - 'local': DirectoryIndexSource(to_path('testdata/'), ''), + 'local': DirectoryIndexSource(TEST_CDX_PATH), 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'), 'live': LiveIndexSource(), @@ -41,15 +41,15 @@ class TestResAgg(FakeRedisTests, BaseTestClass): app.add_route('/live', live_handler) source1 = GeventTimeoutAggregator(sources) - handler1 = DefaultResourceHandler(source1, to_path('testdata/')) + handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH) app.add_route('/many', handler1) - source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))}) - handler2 = DefaultResourceHandler(source2, to_path('testdata/')) + source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')}) + handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH) app.add_route('/posttest', handler2) - source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))}) - handler3 = DefaultResourceHandler(source3, to_path('testdata/')) + source3 = SimpleAggregator({'example': FileIndexSource(TEST_CDX_PATH + 
'example2.cdxj')}) + handler3 = DefaultResourceHandler(source3, TEST_WARC_PATH) app.add_route('/fallback', HandlerSeq([handler3, handler2, @@ -63,7 +63,7 @@ class TestResAgg(FakeRedisTests, BaseTestClass): app.add_route('/empty', HandlerSeq([])) app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})])) - url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))}) + url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(TEST_CDX_PATH + 'url-agnost-example.cdxj')}) app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc')) cls.testapp = webtest.TestApp(app) @@ -329,7 +329,7 @@ foo=bar&test=abc""" def test_redis_warc_1(self): f = FakeStrictRedis.from_url('redis://localhost/2') - f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz') + f.hset('test:warc', 'example2.warc.gz', TEST_WARC_PATH + 'example2.warc.gz') resp = self.testapp.get('/allredis/resource?url=http://www.example.com/') @@ -337,8 +337,8 @@ foo=bar&test=abc""" def test_url_agnost(self): f = FakeStrictRedis.from_url('redis://localhost/2') - f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz') - f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz') + f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-revisit.warc.gz') + f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-orig.warc.gz') resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo') @@ -390,22 +390,22 @@ host: www.youtube.com\ def test_error_redis_file_not_found(self): f = FakeStrictRedis.from_url('redis://localhost/2') - f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz') + f.hset('test:warc', 'example2.warc.gz', 
'./x-no-such-dir/example2.warc.gz') resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503) - assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'" + assert resp.json['message'] == "example2.warc.gz: [Errno 2] No such file or directory: './x-no-such-dir/example2.warc.gz'" - f.hdel('test:warc', 'example.warc.gz') + f.hdel('test:warc', 'example2.warc.gz') resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503) - assert resp.json == {'message': 'example.warc.gz: Archive File Not Found', - 'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}} + assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found', + 'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}} f.delete('test:warc') resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503) - assert resp.json == {'message': 'example.warc.gz: Archive File Not Found', - 'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}} + assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found', + 'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}} def test_error_fallback_live_not_found(self): diff --git a/pywb/webagg/test/test_indexsource.py b/pywb/webagg/test/test_indexsource.py index 40dc825e..6171104b 100644 --- a/pywb/webagg/test/test_indexsource.py +++ b/pywb/webagg/test/test_indexsource.py @@ -1,14 +1,14 @@ -from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource -from webagg.indexsource import LiveIndexSource +from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource +from pywb.webagg.indexsource import LiveIndexSource -from webagg.aggregator import SimpleAggregator +from pywb.webagg.aggregator import SimpleAggregator from pywb.utils.timeutils import timestamp_now -from 
.testutils import key_ts_res - +from .testutils import key_ts_res, TEST_CDX_PATH import pytest +import os from fakeredis import FakeStrictRedis from mock import patch @@ -19,7 +19,7 @@ redismock.start() def setup_module(): r = FakeStrictRedis.from_url('redis://localhost:6379/2') r.delete('test:rediscdx') - with open('testdata/iana.cdxj', 'rb') as fh: + with open(TEST_CDX_PATH + 'iana.cdxj', 'rb') as fh: for line in fh: r.zadd('test:rediscdx', 0, line.rstrip()) @@ -29,7 +29,7 @@ def teardown_module(): local_sources = [ - FileIndexSource('testdata/iana.cdxj'), + FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'), RedisIndexSource('redis://localhost:6379/2/test:rediscdx') ] diff --git a/pywb/webagg/test/test_inputreq.py b/pywb/webagg/test/test_inputreq.py index bdc47705..eb02f6f4 100644 --- a/pywb/webagg/test/test_inputreq.py +++ b/pywb/webagg/test/test_inputreq.py @@ -1,4 +1,4 @@ -from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest +from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest from bottle import Bottle, request, response, debug import webtest import traceback diff --git a/pywb/webagg/test/test_memento_agg.py b/pywb/webagg/test/test_memento_agg.py index 73bd0409..94d4aa91 100644 --- a/pywb/webagg/test/test_memento_agg.py +++ b/pywb/webagg/test/test_memento_agg.py @@ -1,28 +1,34 @@ from gevent import monkey; monkey.patch_all(thread=False) -from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator -from webagg.aggregator import BaseAggregator +from pywb.webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator +from pywb.webagg.aggregator import BaseAggregator -from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource -from .testutils import to_json_list, to_path +from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource +from .testutils import to_json_list, to_path, TEST_CDX_PATH import json import pytest import time import six +import 
yaml -from webagg.handlers import IndexHandler +from mock import patch + +from pywb.webagg.handlers import IndexHandler + +from pywb import get_test_dir +from pywb.utils.wbexception import NotFoundException +# Aggregator Mappings sources = { - 'local': FileIndexSource(to_path('testdata/iana.cdxj')), + 'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'), 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'), 'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'), 'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*') } - aggs = {'simple': SimpleAggregator(sources), 'gevent': GeventTimeoutAggregator(sources, timeout=5.0), } @@ -34,13 +40,41 @@ agg_nf = {'simple': SimpleAggregator(nf), 'gevent': GeventTimeoutAggregator(nf, timeout=5.0), } +# Load expected link headers +link_header_data = None +def setup_module(): + global link_header_data + with open(to_path(get_test_dir() + '/text_content/link_headers.yaml')) as fh: + link_header_data = yaml.load(fh) + + +orig_get_timegate_links = MementoIndexSource.get_timegate_links + +def mock_link_header(test_name, load=False): + def mock_func(self, params, closest): + if load: + res = orig_get_timegate_links(self, params, closest) + print("'{0}': '{1}'".format(self.timegate_url, res)) + return res + + try: + res = link_header_data[test_name][self.timegate_url] + time.sleep(0.2) + except: + msg = self.timegate_url.format(url=params['url']) + raise NotFoundException(msg) + + return res + + return mock_func + @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_1')) def test_mem_agg_index_1(agg): url = 'http://iana.org/' res, errs = agg(dict(url=url, closest='20140126000000', limit=5)) - exp = [{"timestamp": "20140126093743", 
"load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"}, {"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"}, {"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"}, @@ -53,23 +87,25 @@ def test_mem_agg_index_1(agg): 'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}) @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_2')) def test_mem_agg_index_2(agg): url = 'http://example.com/' res, errs = agg(dict(url=url, closest='20100512', limit=6)) exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"}, {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"}, - #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"}, - {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"}, - {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}, + {"timestamp": "20100513224108", "load_url": "http://web.archive.org/web/20100513224108id_/http://example.com/", "source": "ia"}, + {"timestamp": "20100511201151", 'load_url': "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"}, {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"}, - {"timestamp": "20100510233601", "load_url": "http://web.archive.org/web/20100510233601id_/http://example.com/", "source": "ia"}] + {"timestamp": 
"20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}, + ] assert(to_json_list(res) == exp) assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"}) @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_3')) def test_mem_agg_index_3(agg): url = 'http://vvork.com/' res, errs = agg(dict(url=url, closest='20141001', limit=5)) @@ -85,6 +121,7 @@ def test_mem_agg_index_3(agg): @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_4')) def test_mem_agg_index_4(agg): url = 'http://vvork.com/' res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) diff --git a/pywb/webagg/test/test_redis_agg.py b/pywb/webagg/test/test_redis_agg.py index 505350f7..9aadf1df 100644 --- a/pywb/webagg/test/test_redis_agg.py +++ b/pywb/webagg/test/test_redis_agg.py @@ -1,13 +1,13 @@ -from webagg.aggregator import RedisMultiKeyIndexSource -from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass +from pywb.webagg.aggregator import RedisMultiKeyIndexSource +from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass, TEST_CDX_PATH class TestRedisAgg(FakeRedisTests, BaseTestClass): @classmethod def setup_class(cls): super(TestRedisAgg, cls).setup_class() - cls.add_cdx_to_redis(to_path('testdata/example.cdxj'), 'FOO:example:cdxj') - cls.add_cdx_to_redis(to_path('testdata/dupes.cdxj'), 'FOO:dupes:cdxj') + cls.add_cdx_to_redis(TEST_CDX_PATH + 'example2.cdxj', 'FOO:example:cdxj') + cls.add_cdx_to_redis(TEST_CDX_PATH + 'dupes.cdxj', 'FOO:dupes:cdxj') cls.indexloader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj') @@ -17,7 +17,7 @@ class TestRedisAgg(FakeRedisTests, 
BaseTestClass): exp = [ {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, {'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + {'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} ] assert(errs == {}) diff --git a/pywb/webagg/test/test_timeouts.py b/pywb/webagg/test/test_timeouts.py index 60080ce6..980af85e 100644 --- a/pywb/webagg/test/test_timeouts.py +++ b/pywb/webagg/test/test_timeouts.py @@ -1,11 +1,11 @@ from gevent import monkey; monkey.patch_all(thread=False) import time -from webagg.indexsource import FileIndexSource +from pywb.webagg.indexsource import FileIndexSource -from webagg.aggregator import SimpleAggregator, TimeoutMixin -from webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator +from pywb.webagg.aggregator import SimpleAggregator, TimeoutMixin +from pywb.webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator -from .testutils import to_json_list +from .testutils import to_json_list, TEST_CDX_PATH class TimeoutFileSource(FileIndexSource): @@ -26,8 +26,8 @@ TimeoutAggregator = GeventTimeoutAggregator def setup_module(): global sources - sources = {'slow': TimeoutFileSource('testdata/example.cdxj', 0.2), - 'slower': TimeoutFileSource('testdata/dupes.cdxj', 0.5) + sources = {'slow': TimeoutFileSource(TEST_CDX_PATH + 'example2.cdxj', 0.2), + 'slower': TimeoutFileSource(TEST_CDX_PATH + 'dupes.cdxj', 0.5) } diff --git a/pywb/webagg/test/test_upstream.py b/pywb/webagg/test/test_upstream.py index 59854f90..5dc32959 100644 --- a/pywb/webagg/test/test_upstream.py +++ b/pywb/webagg/test/test_upstream.py @@ -1,12 +1,12 @@ import webtest from io import BytesIO -from webagg.app import ResAggApp +from pywb.webagg.app import ResAggApp import requests -from webagg.handlers import DefaultResourceHandler -from 
webagg.aggregator import SimpleAggregator -from webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource +from pywb.webagg.handlers import DefaultResourceHandler +from pywb.webagg.aggregator import SimpleAggregator +from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource from pywb.warc.recordloader import ArcWarcRecordLoader diff --git a/pywb/webagg/test/testutils.py b/pywb/webagg/test/testutils.py index c9ba5be0..63bde954 100644 --- a/pywb/webagg/test/testutils.py +++ b/pywb/webagg/test/testutils.py @@ -10,11 +10,12 @@ from mock import patch from wsgiref.simple_server import make_server -from webagg.aggregator import SimpleAggregator -from webagg.app import ResAggApp -from webagg.handlers import DefaultResourceHandler -from webagg.indexsource import LiveIndexSource +from pywb.webagg.aggregator import SimpleAggregator +from pywb.webagg.app import ResAggApp +from pywb.webagg.handlers import DefaultResourceHandler +from pywb.webagg.indexsource import LiveIndexSource +from pywb import get_test_dir # ============================================================================ def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']): @@ -30,6 +31,11 @@ def to_path(path): return path +# ============================================================================ +TEST_CDX_PATH = to_path(get_test_dir() + '/cdxj/') +TEST_WARC_PATH = to_path(get_test_dir() + '/warcs/') + + # ============================================================================ class BaseTestClass(object): @classmethod diff --git a/testdata/dupes.cdxj b/sample_archive/cdxj/dupes.cdxj similarity index 100% rename from testdata/dupes.cdxj rename to sample_archive/cdxj/dupes.cdxj diff --git a/testdata/example.cdxj b/sample_archive/cdxj/example2.cdxj similarity index 87% rename from testdata/example.cdxj rename to sample_archive/cdxj/example2.cdxj index 72f092f5..1ea3a59a 100644 --- a/testdata/example.cdxj +++ 
b/sample_archive/cdxj/example2.cdxj @@ -1 +1 @@ -com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example.warc.gz"} +com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example2.warc.gz"} diff --git a/testdata/iana.cdxj b/sample_archive/cdxj/iana.cdxj similarity index 100% rename from testdata/iana.cdxj rename to sample_archive/cdxj/iana.cdxj diff --git a/testdata/post-test.cdxj b/sample_archive/cdxj/post-test.cdxj similarity index 100% rename from testdata/post-test.cdxj rename to sample_archive/cdxj/post-test.cdxj diff --git a/testdata/url-agnost-example.cdxj b/sample_archive/cdxj/url-agnost-example.cdxj similarity index 100% rename from testdata/url-agnost-example.cdxj rename to sample_archive/cdxj/url-agnost-example.cdxj diff --git a/sample_archive/text_content/link_headers.yaml b/sample_archive/text_content/link_headers.yaml new file mode 100644 index 00000000..376a63f7 --- /dev/null +++ b/sample_archive/text_content/link_headers.yaml @@ -0,0 +1,31 @@ + +agg_test_1: + 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Wed, 10 Dec 1997 06:17:38 GMT", ; rel="prev memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT", ; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT", ; rel="next memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT", ; rel="last memento"; datetime="Mon, 07 Nov 2016 17:03:30 GMT"' + + 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sun, 01 Oct 2006 07:22:32 GMT", ; rel="prev memento"; datetime="Fri, 13 Dec 2013 01:08:04 GMT", ; rel="memento"; datetime="Tue, 07 Jan 2014 
04:05:52 GMT", ; rel="next memento"; datetime="Fri, 28 Mar 2014 21:32:03 GMT", ; rel="last memento"; datetime="Sun, 06 Nov 2016 01:47:05 GMT"' + +agg_test_2: + 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", ; rel="prev memento"; datetime="Mon, 10 May 2010 23:36:01 GMT", ; rel="memento"; datetime="Thu, 13 May 2010 22:41:08 GMT", ; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", ; rel="last memento"; datetime="Tue, 08 Nov 2016 14:46:31 GMT"' + + 'http://www.webarchive.org.uk/wayback/archive/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="last memento"; datetime="Thu, 13 May 2010 01:00:14 GMT", ; rel="first memento"; datetime="Thu, 10 Apr 2008 12:57:03 GMT", ; rel="prev memento"; datetime="Wed, 12 May 2010 20:44:10 GMT"' + + 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Thu, 08 Oct 2009 22:20:31 GMT", ; rel="prev memento"; datetime="Tue, 27 Apr 2010 18:55:25 GMT", ; rel="memento"; datetime="Tue, 11 May 2010 20:11:51 GMT", ; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", ; rel="last memento"; datetime="Tue, 08 Nov 2016 08:06:53 GMT"' + + +agg_test_3: + 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", ; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", ; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"' + + 'http://www.webarchive.org.uk/wayback/archive/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="last memento"; datetime="Wed, 28 Jul 2010 22:17:01 GMT", ; rel="prev first memento"; 
datetime="Sun, 24 Jan 2010 04:14:39 GMT"' + + 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", ; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", ; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"' + + + 'http://webenact.rhizome.org/vvork/{url}': '; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", ; rel="original", ; rel="timemap"; type="application/link-format"' + + +agg_test_4: + 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", ; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", ; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"' + + 'http://webenact.rhizome.org/vvork/{url}': '; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", ; rel="original", ; rel="timemap"; type="application/link-format"' + + diff --git a/testdata/example.warc.gz b/sample_archive/warcs/example2.warc.gz similarity index 100% rename from testdata/example.warc.gz rename to sample_archive/warcs/example2.warc.gz diff --git a/setup.py b/setup.py index 629ea228..7eafa305 100755 --- a/setup.py +++ b/setup.py @@ -58,7 +58,10 @@ setup( 'pywb.manager', 'pywb.perms', 'pywb.webapp', - 'pywb.apps' + 'pywb.apps', + 'pywb.webagg', + 'pywb.recorder', + 'pywb.urlrewrite' ], package_data={ 'pywb': ['static/flowplayer/*', 'static/*.*', 'templates/*', '*.yaml'], diff --git a/testdata/dupes.warc.gz b/testdata/dupes.warc.gz deleted file mode 100644 index 48e6b6fd..00000000 Binary files a/testdata/dupes.warc.gz and /dev/null differ diff --git a/testdata/example-url-agnostic-orig.warc.gz b/testdata/example-url-agnostic-orig.warc.gz deleted file mode 100644 index 98700373..00000000 Binary files a/testdata/example-url-agnostic-orig.warc.gz and /dev/null differ diff --git 
a/testdata/example-url-agnostic-revisit.warc.gz b/testdata/example-url-agnostic-revisit.warc.gz deleted file mode 100644 index 3770ed0a..00000000 Binary files a/testdata/example-url-agnostic-revisit.warc.gz and /dev/null differ diff --git a/testdata/iana.warc.gz b/testdata/iana.warc.gz deleted file mode 100644 index 3a88a71a..00000000 Binary files a/testdata/iana.warc.gz and /dev/null differ diff --git a/testdata/post-test.warc.gz b/testdata/post-test.warc.gz deleted file mode 100644 index b9cc1f48..00000000 Binary files a/testdata/post-test.warc.gz and /dev/null differ