diff --git a/pywb/__init__.py b/pywb/__init__.py
index c6233bbf..061a9bcc 100644
--- a/pywb/__init__.py
+++ b/pywb/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.33.0'
+__version__ = '0.50.0'
DEFAULT_CONFIG = 'pywb/default_config.yaml'
diff --git a/pywb/webagg/aggregator.py b/pywb/webagg/aggregator.py
index 9ca59b52..78c14b71 100644
--- a/pywb/webagg/aggregator.py
+++ b/pywb/webagg/aggregator.py
@@ -15,10 +15,10 @@ from heapq import merge
from collections import deque
from itertools import chain
-from webagg.indexsource import FileIndexSource, RedisIndexSource
+from pywb.webagg.indexsource import FileIndexSource, RedisIndexSource
from pywb.utils.wbexception import NotFoundException, WbException
-from webagg.utils import ParamFormatter, res_template
+from pywb.webagg.utils import ParamFormatter, res_template
import six
import glob
diff --git a/pywb/webagg/app.py b/pywb/webagg/app.py
index e045480b..b221c85d 100644
--- a/pywb/webagg/app.py
+++ b/pywb/webagg/app.py
@@ -1,4 +1,4 @@
-from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
+from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from werkzeug.routing import Map, Rule
import requests
diff --git a/pywb/webagg/handlers.py b/pywb/webagg/handlers.py
index a8e067f3..b8d2bbfa 100644
--- a/pywb/webagg/handlers.py
+++ b/pywb/webagg/handlers.py
@@ -1,5 +1,5 @@
-from webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
-from webagg.utils import MementoUtils
+from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
+from pywb.webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException
diff --git a/pywb/webagg/indexsource.py b/pywb/webagg/indexsource.py
index a52bb11a..76adc2ab 100644
--- a/pywb/webagg/indexsource.py
+++ b/pywb/webagg/indexsource.py
@@ -8,11 +8,10 @@ from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxobject import CDXObject
-#from webagg.liverec import patched_requests as requests
import requests
-from webagg.utils import ParamFormatter, res_template
-from webagg.utils import MementoUtils
+from pywb.webagg.utils import ParamFormatter, res_template
+from pywb.webagg.utils import MementoUtils
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
diff --git a/pywb/webagg/proxyindexsource.py b/pywb/webagg/proxyindexsource.py
index 435c9240..741f116b 100644
--- a/pywb/webagg/proxyindexsource.py
+++ b/pywb/webagg/proxyindexsource.py
@@ -1,8 +1,8 @@
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.wbexception import NotFoundException
-from webagg.indexsource import BaseIndexSource, RemoteIndexSource
-from webagg.responseloader import LiveWebLoader
-from webagg.utils import ParamFormatter, res_template
+from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
+from pywb.webagg.responseloader import LiveWebLoader
+from pywb.webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now
diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py
index ecda0723..ecebe82a 100644
--- a/pywb/webagg/responseloader.py
+++ b/pywb/webagg/responseloader.py
@@ -1,6 +1,6 @@
-from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
-from webagg.utils import ParamFormatter
-from webagg.indexsource import RedisIndexSource
+from pywb.webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
+from pywb.webagg.utils import ParamFormatter
+from pywb.webagg.indexsource import RedisIndexSource
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
diff --git a/pywb/webagg/test/live.py b/pywb/webagg/test/live.py
index 2e4f84a9..cec4564c 100644
--- a/pywb/webagg/test/live.py
+++ b/pywb/webagg/test/live.py
@@ -1,10 +1,10 @@
from gevent.monkey import patch_all; patch_all()
-from webagg.test.testutils import LiveServerTests
-from webagg.handlers import DefaultResourceHandler
-from webagg.app import ResAggApp
-from webagg.indexsource import LiveIndexSource, RedisIndexSource
-from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
+from pywb.webagg.test.testutils import LiveServerTests
+from pywb.webagg.handlers import DefaultResourceHandler
+from pywb.webagg.app import ResAggApp
+from pywb.webagg.indexsource import LiveIndexSource, RedisIndexSource
+from pywb.webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
def simpleapp():
app = ResAggApp(debug=True)
diff --git a/pywb/webagg/test/test_dir_agg.py b/pywb/webagg/test/test_dir_agg.py
index bce07046..0b1c521c 100644
--- a/pywb/webagg/test/test_dir_agg.py
+++ b/pywb/webagg/test/test_dir_agg.py
@@ -3,15 +3,15 @@ import os
import shutil
import json
-from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
+from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass, TEST_CDX_PATH
from mock import patch
import time
-from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
-from webagg.aggregator import SimpleAggregator
-from webagg.indexsource import MementoIndexSource
+from pywb.webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
+from pywb.webagg.aggregator import SimpleAggregator
+from pywb.webagg.indexsource import MementoIndexSource
#=============================================================================
@@ -39,9 +39,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
dir_prefix = to_path(cls.root_dir)
dir_path ='colls/{coll}/indexes'
- shutil.copy(to_path('testdata/example.cdxj'), coll_A)
- shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
- shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
+ shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A)
+ shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B)
+ shutil.copy(to_path(TEST_CDX_PATH + 'dupes.cdxj'), coll_C)
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
fh.write('foo')
@@ -57,7 +57,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_collA_found(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
- exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
+ exp = [{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@@ -108,13 +108,13 @@ class TestDirAgg(TempDirTests, BaseTestClass):
exp = [
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
- {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
+ {'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
assert(errs == {})
- @patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
+ @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
def test_agg_dir_and_memento(self):
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'local': self.dir_loader}
@@ -128,7 +128,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
- {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
+ {'source': 'local:colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
@@ -156,7 +156,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_dir_sources_1(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
- exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
+ exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
@@ -166,7 +166,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_dir_sources_2(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
- exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
+ exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
@@ -177,7 +177,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
- exp = {'sources': {'example.cdxj': 'file'}}
+ exp = {'sources': {'example2.cdxj': 'file'}}
assert(res == exp)
@@ -193,7 +193,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_cache_dir_sources_1(self):
- exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
+ exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
diff --git a/pywb/webagg/test/test_handlers.py b/pywb/webagg/test/test_handlers.py
index 6fb5c8d8..5eed24ac 100644
--- a/pywb/webagg/test/test_handlers.py
+++ b/pywb/webagg/test/test_handlers.py
@@ -2,14 +2,14 @@
from collections import OrderedDict
-from webagg.handlers import DefaultResourceHandler, HandlerSeq
+from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
-from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
-from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
-from webagg.aggregator import DirectoryIndexSource
+from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
+from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
+from pywb.webagg.aggregator import DirectoryIndexSource
-from webagg.app import ResAggApp
-from webagg.utils import MementoUtils
+from pywb.webagg.app import ResAggApp
+from pywb.webagg.utils import MementoUtils
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
@@ -19,12 +19,12 @@ from six.moves.urllib.parse import urlencode
import webtest
from fakeredis import FakeStrictRedis
-from .testutils import to_path, FakeRedisTests, BaseTestClass
+from .testutils import to_path, FakeRedisTests, BaseTestClass, TEST_CDX_PATH, TEST_WARC_PATH
import json
sources = {
- 'local': DirectoryIndexSource(to_path('testdata/'), ''),
+ 'local': DirectoryIndexSource(TEST_CDX_PATH),
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
'live': LiveIndexSource(),
@@ -41,15 +41,15 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
app.add_route('/live', live_handler)
source1 = GeventTimeoutAggregator(sources)
- handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
+ handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
app.add_route('/many', handler1)
- source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
- handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
+ source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
+ handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
app.add_route('/posttest', handler2)
- source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
- handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
+ source3 = SimpleAggregator({'example': FileIndexSource(TEST_CDX_PATH + 'example2.cdxj')})
+ handler3 = DefaultResourceHandler(source3, TEST_WARC_PATH)
app.add_route('/fallback', HandlerSeq([handler3,
handler2,
@@ -63,7 +63,7 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
app.add_route('/empty', HandlerSeq([]))
app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
- url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))})
+ url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(TEST_CDX_PATH + 'url-agnost-example.cdxj')})
app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc'))
cls.testapp = webtest.TestApp(app)
@@ -329,7 +329,7 @@ foo=bar&test=abc"""
def test_redis_warc_1(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
- f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz')
+ f.hset('test:warc', 'example2.warc.gz', TEST_WARC_PATH + 'example2.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')
@@ -337,8 +337,8 @@ foo=bar&test=abc"""
def test_url_agnost(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
- f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz')
- f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz')
+ f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-revisit.warc.gz')
+ f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-orig.warc.gz')
 resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')
@@ -390,22 +390,22 @@ host: www.youtube.com\
def test_error_redis_file_not_found(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
- f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz')
+ f.hset('test:warc', 'example2.warc.gz', './x-no-such-dir/example2.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
- assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'"
+ assert resp.json['message'] == "example2.warc.gz: [Errno 2] No such file or directory: './x-no-such-dir/example2.warc.gz'"
- f.hdel('test:warc', 'example.warc.gz')
+ f.hdel('test:warc', 'example2.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
- assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
- 'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
+ assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
+ 'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}
f.delete('test:warc')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
- assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
- 'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
+ assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
+ 'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}
def test_error_fallback_live_not_found(self):
diff --git a/pywb/webagg/test/test_indexsource.py b/pywb/webagg/test/test_indexsource.py
index 40dc825e..6171104b 100644
--- a/pywb/webagg/test/test_indexsource.py
+++ b/pywb/webagg/test/test_indexsource.py
@@ -1,14 +1,14 @@
-from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
-from webagg.indexsource import LiveIndexSource
+from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
+from pywb.webagg.indexsource import LiveIndexSource
-from webagg.aggregator import SimpleAggregator
+from pywb.webagg.aggregator import SimpleAggregator
from pywb.utils.timeutils import timestamp_now
-from .testutils import key_ts_res
-
+from .testutils import key_ts_res, TEST_CDX_PATH
import pytest
+import os
from fakeredis import FakeStrictRedis
from mock import patch
@@ -19,7 +19,7 @@ redismock.start()
def setup_module():
r = FakeStrictRedis.from_url('redis://localhost:6379/2')
r.delete('test:rediscdx')
- with open('testdata/iana.cdxj', 'rb') as fh:
+ with open(TEST_CDX_PATH + 'iana.cdxj', 'rb') as fh:
for line in fh:
r.zadd('test:rediscdx', 0, line.rstrip())
@@ -29,7 +29,7 @@ def teardown_module():
local_sources = [
- FileIndexSource('testdata/iana.cdxj'),
+ FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
RedisIndexSource('redis://localhost:6379/2/test:rediscdx')
]
diff --git a/pywb/webagg/test/test_inputreq.py b/pywb/webagg/test/test_inputreq.py
index bdc47705..eb02f6f4 100644
--- a/pywb/webagg/test/test_inputreq.py
+++ b/pywb/webagg/test/test_inputreq.py
@@ -1,4 +1,4 @@
-from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
+from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from bottle import Bottle, request, response, debug
import webtest
import traceback
diff --git a/pywb/webagg/test/test_memento_agg.py b/pywb/webagg/test/test_memento_agg.py
index 73bd0409..94d4aa91 100644
--- a/pywb/webagg/test/test_memento_agg.py
+++ b/pywb/webagg/test/test_memento_agg.py
@@ -1,28 +1,34 @@
from gevent import monkey; monkey.patch_all(thread=False)
-from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
-from webagg.aggregator import BaseAggregator
+from pywb.webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
+from pywb.webagg.aggregator import BaseAggregator
-from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
-from .testutils import to_json_list, to_path
+from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
+from .testutils import to_json_list, to_path, TEST_CDX_PATH
import json
import pytest
import time
import six
+import yaml
-from webagg.handlers import IndexHandler
+from mock import patch
+
+from pywb.webagg.handlers import IndexHandler
+
+from pywb import get_test_dir
+from pywb.utils.wbexception import NotFoundException
+# Aggregator Mappings
sources = {
- 'local': FileIndexSource(to_path('testdata/iana.cdxj')),
+ 'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
}
-
aggs = {'simple': SimpleAggregator(sources),
'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
}
@@ -34,13 +40,41 @@ agg_nf = {'simple': SimpleAggregator(nf),
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
}
+# Load expected link headers (test name -> {timegate URL template: recorded Link header value})
+link_header_data = None
+def setup_module():
+    global link_header_data  # module-level cache, read by mock_link_header()'s mock_func
+    with open(to_path(get_test_dir() + '/text_content/link_headers.yaml')) as fh:
+        link_header_data = yaml.safe_load(fh)  # safe_load: plain YAML types only; yaml.load() without a Loader fails on PyYAML >= 6
+
+
+orig_get_timegate_links = MementoIndexSource.get_timegate_links
+
+def mock_link_header(test_name, load=False):
+    """Build a replacement for MementoIndexSource.get_timegate_links.
+
+    Replays link_header_data[test_name]; load=True calls the real method and prints its result.
+    """
+    def mock_func(self, params, closest):
+        if load:
+            res = orig_get_timegate_links(self, params, closest)
+            print("'{0}': '{1}'".format(self.timegate_url, res))
+            return res
+        try:
+            res = link_header_data[test_name][self.timegate_url]
+        except KeyError:  # nothing recorded for this test/timegate: report it as not found
+            raise NotFoundException(self.timegate_url.format(url=params['url']))
+        time.sleep(0.2)  # kept outside the try so timeouts/interrupts raised while sleeping are not masked
+        return res
+    return mock_func
+
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
+@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_1'))
def test_mem_agg_index_1(agg):
url = 'http://iana.org/'
res, errs = agg(dict(url=url, closest='20140126000000', limit=5))
-
exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
{"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"},
{"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"},
@@ -53,23 +87,25 @@ def test_mem_agg_index_1(agg):
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
+@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_2'))
def test_mem_agg_index_2(agg):
url = 'http://example.com/'
res, errs = agg(dict(url=url, closest='20100512', limit=6))
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
- #{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
- {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
- {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
+ {"timestamp": "20100513224108", "load_url": "http://web.archive.org/web/20100513224108id_/http://example.com/", "source": "ia"},
+ {"timestamp": "20100511201151", 'load_url': "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
- {"timestamp": "20100510233601", "load_url": "http://web.archive.org/web/20100510233601id_/http://example.com/", "source": "ia"}]
+ {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
+ ]
assert(to_json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
+@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_3'))
def test_mem_agg_index_3(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=5))
@@ -85,6 +121,7 @@ def test_mem_agg_index_3(agg):
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
+@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_4'))
def test_mem_agg_index_4(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
diff --git a/pywb/webagg/test/test_redis_agg.py b/pywb/webagg/test/test_redis_agg.py
index 505350f7..9aadf1df 100644
--- a/pywb/webagg/test/test_redis_agg.py
+++ b/pywb/webagg/test/test_redis_agg.py
@@ -1,13 +1,13 @@
-from webagg.aggregator import RedisMultiKeyIndexSource
-from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass
+from pywb.webagg.aggregator import RedisMultiKeyIndexSource
+from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass, TEST_CDX_PATH
class TestRedisAgg(FakeRedisTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRedisAgg, cls).setup_class()
- cls.add_cdx_to_redis(to_path('testdata/example.cdxj'), 'FOO:example:cdxj')
- cls.add_cdx_to_redis(to_path('testdata/dupes.cdxj'), 'FOO:dupes:cdxj')
+ cls.add_cdx_to_redis(TEST_CDX_PATH + 'example2.cdxj', 'FOO:example:cdxj')
+ cls.add_cdx_to_redis(TEST_CDX_PATH + 'dupes.cdxj', 'FOO:dupes:cdxj')
cls.indexloader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj')
@@ -17,7 +17,7 @@ class TestRedisAgg(FakeRedisTests, BaseTestClass):
exp = [
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
- {'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
+ {'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(errs == {})
diff --git a/pywb/webagg/test/test_timeouts.py b/pywb/webagg/test/test_timeouts.py
index 60080ce6..980af85e 100644
--- a/pywb/webagg/test/test_timeouts.py
+++ b/pywb/webagg/test/test_timeouts.py
@@ -1,11 +1,11 @@
from gevent import monkey; monkey.patch_all(thread=False)
import time
-from webagg.indexsource import FileIndexSource
+from pywb.webagg.indexsource import FileIndexSource
-from webagg.aggregator import SimpleAggregator, TimeoutMixin
-from webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
+from pywb.webagg.aggregator import SimpleAggregator, TimeoutMixin
+from pywb.webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
-from .testutils import to_json_list
+from .testutils import to_json_list, TEST_CDX_PATH
class TimeoutFileSource(FileIndexSource):
@@ -26,8 +26,8 @@ TimeoutAggregator = GeventTimeoutAggregator
def setup_module():
global sources
- sources = {'slow': TimeoutFileSource('testdata/example.cdxj', 0.2),
- 'slower': TimeoutFileSource('testdata/dupes.cdxj', 0.5)
+ sources = {'slow': TimeoutFileSource(TEST_CDX_PATH + 'example2.cdxj', 0.2),
+ 'slower': TimeoutFileSource(TEST_CDX_PATH + 'dupes.cdxj', 0.5)
}
diff --git a/pywb/webagg/test/test_upstream.py b/pywb/webagg/test/test_upstream.py
index 59854f90..5dc32959 100644
--- a/pywb/webagg/test/test_upstream.py
+++ b/pywb/webagg/test/test_upstream.py
@@ -1,12 +1,12 @@
import webtest
from io import BytesIO
-from webagg.app import ResAggApp
+from pywb.webagg.app import ResAggApp
import requests
-from webagg.handlers import DefaultResourceHandler
-from webagg.aggregator import SimpleAggregator
-from webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
+from pywb.webagg.handlers import DefaultResourceHandler
+from pywb.webagg.aggregator import SimpleAggregator
+from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
from pywb.warc.recordloader import ArcWarcRecordLoader
diff --git a/pywb/webagg/test/testutils.py b/pywb/webagg/test/testutils.py
index c9ba5be0..63bde954 100644
--- a/pywb/webagg/test/testutils.py
+++ b/pywb/webagg/test/testutils.py
@@ -10,11 +10,12 @@ from mock import patch
from wsgiref.simple_server import make_server
-from webagg.aggregator import SimpleAggregator
-from webagg.app import ResAggApp
-from webagg.handlers import DefaultResourceHandler
-from webagg.indexsource import LiveIndexSource
+from pywb.webagg.aggregator import SimpleAggregator
+from pywb.webagg.app import ResAggApp
+from pywb.webagg.handlers import DefaultResourceHandler
+from pywb.webagg.indexsource import LiveIndexSource
+from pywb import get_test_dir
# ============================================================================
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
@@ -30,6 +31,11 @@ def to_path(path):
return path
+# ============================================================================
+TEST_CDX_PATH = to_path(get_test_dir() + '/cdxj/')
+TEST_WARC_PATH = to_path(get_test_dir() + '/warcs/')
+
+
# ============================================================================
class BaseTestClass(object):
@classmethod
diff --git a/testdata/dupes.cdxj b/sample_archive/cdxj/dupes.cdxj
similarity index 100%
rename from testdata/dupes.cdxj
rename to sample_archive/cdxj/dupes.cdxj
diff --git a/testdata/example.cdxj b/sample_archive/cdxj/example2.cdxj
similarity index 87%
rename from testdata/example.cdxj
rename to sample_archive/cdxj/example2.cdxj
index 72f092f5..1ea3a59a 100644
--- a/testdata/example.cdxj
+++ b/sample_archive/cdxj/example2.cdxj
@@ -1 +1 @@
-com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example.warc.gz"}
+com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example2.warc.gz"}
diff --git a/testdata/iana.cdxj b/sample_archive/cdxj/iana.cdxj
similarity index 100%
rename from testdata/iana.cdxj
rename to sample_archive/cdxj/iana.cdxj
diff --git a/testdata/post-test.cdxj b/sample_archive/cdxj/post-test.cdxj
similarity index 100%
rename from testdata/post-test.cdxj
rename to sample_archive/cdxj/post-test.cdxj
diff --git a/testdata/url-agnost-example.cdxj b/sample_archive/cdxj/url-agnost-example.cdxj
similarity index 100%
rename from testdata/url-agnost-example.cdxj
rename to sample_archive/cdxj/url-agnost-example.cdxj
diff --git a/sample_archive/text_content/link_headers.yaml b/sample_archive/text_content/link_headers.yaml
new file mode 100644
index 00000000..376a63f7
--- /dev/null
+++ b/sample_archive/text_content/link_headers.yaml
@@ -0,0 +1,31 @@
+
+agg_test_1:
+ 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Wed, 10 Dec 1997 06:17:38 GMT", ; rel="prev memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT", ; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT", ; rel="next memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT", ; rel="last memento"; datetime="Mon, 07 Nov 2016 17:03:30 GMT"'
+
+ 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sun, 01 Oct 2006 07:22:32 GMT", ; rel="prev memento"; datetime="Fri, 13 Dec 2013 01:08:04 GMT", ; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT", ; rel="next memento"; datetime="Fri, 28 Mar 2014 21:32:03 GMT", ; rel="last memento"; datetime="Sun, 06 Nov 2016 01:47:05 GMT"'
+
+agg_test_2:
+ 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", ; rel="prev memento"; datetime="Mon, 10 May 2010 23:36:01 GMT", ; rel="memento"; datetime="Thu, 13 May 2010 22:41:08 GMT", ; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", ; rel="last memento"; datetime="Tue, 08 Nov 2016 14:46:31 GMT"'
+
+ 'http://www.webarchive.org.uk/wayback/archive/{url}': '; rel="original", /www.webarchive.org.uk/wayback/archive/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", /www.webarchive.org.uk/wayback/archive/20100513010014/http://example.com/>; rel="last memento"; datetime="Thu, 13 May 2010 01:00:14 GMT", /www.webarchive.org.uk/wayback/archive/20080410125703/http://example.com/>; rel="first memento"; datetime="Thu, 10 Apr 2008 12:57:03 GMT", /www.webarchive.org.uk/wayback/archive/20100512204410/http://example.com/>; rel="prev memento"; datetime="Wed, 12 May 2010 20:44:10 GMT"'
+
+ 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Thu, 08 Oct 2009 22:20:31 GMT", ; rel="prev memento"; datetime="Tue, 27 Apr 2010 18:55:25 GMT", ; rel="memento"; datetime="Tue, 11 May 2010 20:11:51 GMT", ; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", ; rel="last memento"; datetime="Tue, 08 Nov 2016 08:06:53 GMT"'
+
+
+agg_test_3:
+ 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", ; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", ; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"'
+
+ 'http://www.webarchive.org.uk/wayback/archive/{url}': '; rel="original", /www.webarchive.org.uk/wayback/archive/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", /www.webarchive.org.uk/wayback/archive/20100728221701/http://vvork.com/>; rel="last memento"; datetime="Wed, 28 Jul 2010 22:17:01 GMT", /www.webarchive.org.uk/wayback/archive/20100124041439/http://vvork.com/>; rel="prev first memento"; datetime="Sun, 24 Jan 2010 04:14:39 GMT"'
+
+ 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", ; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", ; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"'
+
+
+ 'http://webenact.rhizome.org/vvork/{url}': '; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", ; rel="original", ; rel="timemap"; type="application/link-format"'
+
+
+agg_test_4:
+ 'http://wayback.archive-it.org/all/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", ; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", ; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"'
+
+ 'http://webenact.rhizome.org/vvork/{url}': '; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", ; rel="original", ; rel="timemap"; type="application/link-format"'
+
+
diff --git a/testdata/example.warc.gz b/sample_archive/warcs/example2.warc.gz
similarity index 100%
rename from testdata/example.warc.gz
rename to sample_archive/warcs/example2.warc.gz
diff --git a/setup.py b/setup.py
index 629ea228..7eafa305 100755
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,10 @@ setup(
'pywb.manager',
'pywb.perms',
'pywb.webapp',
- 'pywb.apps'
+ 'pywb.apps',
+ 'pywb.webagg',
+ 'pywb.recorder',
+ 'pywb.urlrewrite'
],
package_data={
'pywb': ['static/flowplayer/*', 'static/*.*', 'templates/*', '*.yaml'],
diff --git a/testdata/dupes.warc.gz b/testdata/dupes.warc.gz
deleted file mode 100644
index 48e6b6fd..00000000
Binary files a/testdata/dupes.warc.gz and /dev/null differ
diff --git a/testdata/example-url-agnostic-orig.warc.gz b/testdata/example-url-agnostic-orig.warc.gz
deleted file mode 100644
index 98700373..00000000
Binary files a/testdata/example-url-agnostic-orig.warc.gz and /dev/null differ
diff --git a/testdata/example-url-agnostic-revisit.warc.gz b/testdata/example-url-agnostic-revisit.warc.gz
deleted file mode 100644
index 3770ed0a..00000000
Binary files a/testdata/example-url-agnostic-revisit.warc.gz and /dev/null differ
diff --git a/testdata/iana.warc.gz b/testdata/iana.warc.gz
deleted file mode 100644
index 3a88a71a..00000000
Binary files a/testdata/iana.warc.gz and /dev/null differ
diff --git a/testdata/post-test.warc.gz b/testdata/post-test.warc.gz
deleted file mode 100644
index b9cc1f48..00000000
Binary files a/testdata/post-test.warc.gz and /dev/null differ