mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor: fix pywb.webagg package paths
all webagg tests working! move testdata cdxj into sample_archive, remove rest (duplicates) #200
This commit is contained in:
parent
99e5008ac0
commit
6b4b038471
@ -1,4 +1,4 @@
|
||||
__version__ = '0.33.0'
|
||||
__version__ = '0.50.0'
|
||||
|
||||
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
||||
|
||||
|
@ -15,10 +15,10 @@ from heapq import merge
|
||||
from collections import deque
|
||||
from itertools import chain
|
||||
|
||||
from webagg.indexsource import FileIndexSource, RedisIndexSource
|
||||
from pywb.webagg.indexsource import FileIndexSource, RedisIndexSource
|
||||
from pywb.utils.wbexception import NotFoundException, WbException
|
||||
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
from pywb.webagg.utils import ParamFormatter, res_template
|
||||
|
||||
import six
|
||||
import glob
|
||||
|
@ -1,4 +1,4 @@
|
||||
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from werkzeug.routing import Map, Rule
|
||||
|
||||
import requests
|
||||
|
@ -1,5 +1,5 @@
|
||||
from webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
|
||||
from webagg.utils import MementoUtils
|
||||
from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
|
||||
from pywb.webagg.utils import MementoUtils
|
||||
from pywb.utils.wbexception import BadRequestException, WbException
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
|
@ -8,11 +8,10 @@ from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
#from webagg.liverec import patched_requests as requests
|
||||
import requests
|
||||
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
from webagg.utils import MementoUtils
|
||||
from pywb.webagg.utils import ParamFormatter, res_template
|
||||
from pywb.webagg.utils import MementoUtils
|
||||
|
||||
|
||||
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
|
||||
|
@ -1,8 +1,8 @@
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from webagg.indexsource import BaseIndexSource, RemoteIndexSource
|
||||
from webagg.responseloader import LiveWebLoader
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
|
||||
from pywb.webagg.responseloader import LiveWebLoader
|
||||
from pywb.webagg.utils import ParamFormatter, res_template
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
|
||||
from webagg.utils import ParamFormatter
|
||||
from webagg.indexsource import RedisIndexSource
|
||||
from pywb.webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
|
||||
from pywb.webagg.utils import ParamFormatter
|
||||
from pywb.webagg.indexsource import RedisIndexSource
|
||||
|
||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
|
||||
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date
|
||||
|
@ -1,10 +1,10 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
|
||||
from webagg.test.testutils import LiveServerTests
|
||||
from webagg.handlers import DefaultResourceHandler
|
||||
from webagg.app import ResAggApp
|
||||
from webagg.indexsource import LiveIndexSource, RedisIndexSource
|
||||
from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
|
||||
from pywb.webagg.test.testutils import LiveServerTests
|
||||
from pywb.webagg.handlers import DefaultResourceHandler
|
||||
from pywb.webagg.app import ResAggApp
|
||||
from pywb.webagg.indexsource import LiveIndexSource, RedisIndexSource
|
||||
from pywb.webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
|
||||
|
||||
def simpleapp():
|
||||
app = ResAggApp(debug=True)
|
||||
|
@ -3,15 +3,15 @@ import os
|
||||
import shutil
|
||||
import json
|
||||
|
||||
from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
|
||||
from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass, TEST_CDX_PATH
|
||||
|
||||
from mock import patch
|
||||
|
||||
import time
|
||||
|
||||
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
|
||||
from webagg.aggregator import SimpleAggregator
|
||||
from webagg.indexsource import MementoIndexSource
|
||||
from pywb.webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
|
||||
from pywb.webagg.aggregator import SimpleAggregator
|
||||
from pywb.webagg.indexsource import MementoIndexSource
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -39,9 +39,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
dir_prefix = to_path(cls.root_dir)
|
||||
dir_path ='colls/{coll}/indexes'
|
||||
|
||||
shutil.copy(to_path('testdata/example.cdxj'), coll_A)
|
||||
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
|
||||
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
|
||||
shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A)
|
||||
shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B)
|
||||
shutil.copy(to_path(TEST_CDX_PATH + 'dupes.cdxj'), coll_C)
|
||||
|
||||
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
|
||||
fh.write('foo')
|
||||
@ -57,7 +57,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
def test_agg_collA_found(self):
|
||||
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
|
||||
|
||||
exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
|
||||
exp = [{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
@ -108,13 +108,13 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
exp = [
|
||||
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
|
||||
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
|
||||
def test_agg_dir_and_memento(self):
|
||||
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
|
||||
'local': self.dir_loader}
|
||||
@ -128,7 +128,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
|
||||
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
{'source': 'local:colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
@ -156,7 +156,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
|
||||
def test_agg_dir_sources_1(self):
|
||||
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
|
||||
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
|
||||
'colls/B/indexes/iana.cdxj': 'file',
|
||||
'colls/C/indexes/dupes.cdxj': 'file'}
|
||||
}
|
||||
@ -166,7 +166,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
|
||||
def test_agg_dir_sources_2(self):
|
||||
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
|
||||
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
|
||||
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
|
||||
'colls/C/indexes/dupes.cdxj': 'file'}
|
||||
}
|
||||
|
||||
@ -177,7 +177,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
|
||||
res = loader.get_source_list({'url': 'example.com/'})
|
||||
|
||||
exp = {'sources': {'example.cdxj': 'file'}}
|
||||
exp = {'sources': {'example2.cdxj': 'file'}}
|
||||
|
||||
assert(res == exp)
|
||||
|
||||
@ -193,7 +193,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
|
||||
|
||||
def test_cache_dir_sources_1(self):
|
||||
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
|
||||
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
|
||||
'colls/B/indexes/iana.cdxj': 'file',
|
||||
'colls/C/indexes/dupes.cdxj': 'file'}
|
||||
}
|
||||
|
@ -2,14 +2,14 @@
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from webagg.handlers import DefaultResourceHandler, HandlerSeq
|
||||
from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
|
||||
|
||||
from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
|
||||
from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
|
||||
from webagg.aggregator import DirectoryIndexSource
|
||||
from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
|
||||
from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
|
||||
from pywb.webagg.aggregator import DirectoryIndexSource
|
||||
|
||||
from webagg.app import ResAggApp
|
||||
from webagg.utils import MementoUtils
|
||||
from pywb.webagg.app import ResAggApp
|
||||
from pywb.webagg.utils import MementoUtils
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
@ -19,12 +19,12 @@ from six.moves.urllib.parse import urlencode
|
||||
import webtest
|
||||
from fakeredis import FakeStrictRedis
|
||||
|
||||
from .testutils import to_path, FakeRedisTests, BaseTestClass
|
||||
from .testutils import to_path, FakeRedisTests, BaseTestClass, TEST_CDX_PATH, TEST_WARC_PATH
|
||||
|
||||
import json
|
||||
|
||||
sources = {
|
||||
'local': DirectoryIndexSource(to_path('testdata/'), ''),
|
||||
'local': DirectoryIndexSource(TEST_CDX_PATH),
|
||||
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
|
||||
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
|
||||
'live': LiveIndexSource(),
|
||||
@ -41,15 +41,15 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
|
||||
app.add_route('/live', live_handler)
|
||||
|
||||
source1 = GeventTimeoutAggregator(sources)
|
||||
handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
|
||||
handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
|
||||
app.add_route('/many', handler1)
|
||||
|
||||
source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
|
||||
handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
|
||||
source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
|
||||
handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
|
||||
app.add_route('/posttest', handler2)
|
||||
|
||||
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
|
||||
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
|
||||
source3 = SimpleAggregator({'example': FileIndexSource(TEST_CDX_PATH + 'example2.cdxj')})
|
||||
handler3 = DefaultResourceHandler(source3, TEST_WARC_PATH)
|
||||
|
||||
app.add_route('/fallback', HandlerSeq([handler3,
|
||||
handler2,
|
||||
@ -63,7 +63,7 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
|
||||
app.add_route('/empty', HandlerSeq([]))
|
||||
app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
|
||||
|
||||
url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))})
|
||||
url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(TEST_CDX_PATH + 'url-agnost-example.cdxj')})
|
||||
app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc'))
|
||||
|
||||
cls.testapp = webtest.TestApp(app)
|
||||
@ -329,7 +329,7 @@ foo=bar&test=abc"""
|
||||
|
||||
def test_redis_warc_1(self):
|
||||
f = FakeStrictRedis.from_url('redis://localhost/2')
|
||||
f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz')
|
||||
f.hset('test:warc', 'example2.warc.gz', TEST_WARC_PATH + 'example2.warc.gz')
|
||||
|
||||
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')
|
||||
|
||||
@ -337,8 +337,8 @@ foo=bar&test=abc"""
|
||||
|
||||
def test_url_agnost(self):
|
||||
f = FakeStrictRedis.from_url('redis://localhost/2')
|
||||
f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz')
|
||||
f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz')
|
||||
f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-revisit.warc.gz')
|
||||
f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-orig.warc.gz')
|
||||
|
||||
resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')
|
||||
|
||||
@ -390,22 +390,22 @@ host: www.youtube.com\
|
||||
|
||||
def test_error_redis_file_not_found(self):
|
||||
f = FakeStrictRedis.from_url('redis://localhost/2')
|
||||
f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz')
|
||||
f.hset('test:warc', 'example2.warc.gz', './x-no-such-dir/example2.warc.gz')
|
||||
|
||||
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
|
||||
assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'"
|
||||
assert resp.json['message'] == "example2.warc.gz: [Errno 2] No such file or directory: './x-no-such-dir/example2.warc.gz'"
|
||||
|
||||
f.hdel('test:warc', 'example.warc.gz')
|
||||
f.hdel('test:warc', 'example2.warc.gz')
|
||||
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
|
||||
|
||||
assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
|
||||
'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
|
||||
assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
|
||||
'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}
|
||||
|
||||
f.delete('test:warc')
|
||||
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
|
||||
|
||||
assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
|
||||
'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
|
||||
assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
|
||||
'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}
|
||||
|
||||
|
||||
def test_error_fallback_live_not_found(self):
|
||||
|
@ -1,14 +1,14 @@
|
||||
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
|
||||
from webagg.indexsource import LiveIndexSource
|
||||
from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
|
||||
from pywb.webagg.indexsource import LiveIndexSource
|
||||
|
||||
from webagg.aggregator import SimpleAggregator
|
||||
from pywb.webagg.aggregator import SimpleAggregator
|
||||
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
from .testutils import key_ts_res
|
||||
|
||||
from .testutils import key_ts_res, TEST_CDX_PATH
|
||||
|
||||
import pytest
|
||||
import os
|
||||
|
||||
from fakeredis import FakeStrictRedis
|
||||
from mock import patch
|
||||
@ -19,7 +19,7 @@ redismock.start()
|
||||
def setup_module():
|
||||
r = FakeStrictRedis.from_url('redis://localhost:6379/2')
|
||||
r.delete('test:rediscdx')
|
||||
with open('testdata/iana.cdxj', 'rb') as fh:
|
||||
with open(TEST_CDX_PATH + 'iana.cdxj', 'rb') as fh:
|
||||
for line in fh:
|
||||
r.zadd('test:rediscdx', 0, line.rstrip())
|
||||
|
||||
@ -29,7 +29,7 @@ def teardown_module():
|
||||
|
||||
|
||||
local_sources = [
|
||||
FileIndexSource('testdata/iana.cdxj'),
|
||||
FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
|
||||
RedisIndexSource('redis://localhost:6379/2/test:rediscdx')
|
||||
]
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from bottle import Bottle, request, response, debug
|
||||
import webtest
|
||||
import traceback
|
||||
|
@ -1,28 +1,34 @@
|
||||
from gevent import monkey; monkey.patch_all(thread=False)
|
||||
|
||||
from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
|
||||
from webagg.aggregator import BaseAggregator
|
||||
from pywb.webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
|
||||
from pywb.webagg.aggregator import BaseAggregator
|
||||
|
||||
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
|
||||
from .testutils import to_json_list, to_path
|
||||
from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
|
||||
from .testutils import to_json_list, to_path, TEST_CDX_PATH
|
||||
|
||||
import json
|
||||
import pytest
|
||||
import time
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from webagg.handlers import IndexHandler
|
||||
from mock import patch
|
||||
|
||||
from pywb.webagg.handlers import IndexHandler
|
||||
|
||||
from pywb import get_test_dir
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
|
||||
# Aggregator Mappings
|
||||
sources = {
|
||||
'local': FileIndexSource(to_path('testdata/iana.cdxj')),
|
||||
'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
|
||||
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
|
||||
'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
|
||||
'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
|
||||
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
|
||||
}
|
||||
|
||||
|
||||
aggs = {'simple': SimpleAggregator(sources),
|
||||
'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
|
||||
}
|
||||
@ -34,13 +40,41 @@ agg_nf = {'simple': SimpleAggregator(nf),
|
||||
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
|
||||
}
|
||||
|
||||
# Load expected link headers
|
||||
link_header_data = None
|
||||
def setup_module():
    """Load the expected Memento link headers used by the mocked timegates.

    Reads ``text_content/link_headers.yaml`` from the test data directory
    (resolved via ``get_test_dir()`` and ``to_path()``) and stores the
    parsed content in the module-level ``link_header_data`` global.
    """
    global link_header_data
    with open(to_path(get_test_dir() + '/text_content/link_headers.yaml')) as fh:
        # safe_load: the fixture only holds plain mappings of strings, and
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6
        # (and can construct arbitrary objects on older versions).
        link_header_data = yaml.safe_load(fh)
|
||||
|
||||
|
||||
orig_get_timegate_links = MementoIndexSource.get_timegate_links
|
||||
|
||||
def mock_link_header(test_name, load=False):
    """Build a replacement for ``MementoIndexSource.get_timegate_links``.

    :param test_name: top-level key in ``link_header_data`` selecting the
        set of recorded link headers for one test.
    :param load: when true, delegate to the original
        ``get_timegate_links`` and print the result in
        ``'<timegate_url>': '<link header>'`` form instead of using the
        recorded data.
    :return: a function with the ``(self, params, closest)`` signature,
        suitable for use with ``patch``.
    :raises NotFoundException: (from the returned function) when no
        recorded header exists for ``self.timegate_url`` under
        ``test_name``.
    """
    def mock_func(self, params, closest):
        if load:
            res = orig_get_timegate_links(self, params, closest)
            print("'{0}': '{1}'".format(self.timegate_url, res))
            return res

        try:
            res = link_header_data[test_name][self.timegate_url]
        except (KeyError, TypeError):
            # KeyError: no entry recorded for this test / timegate.
            # TypeError: link_header_data is still None (not loaded yet).
            # Deliberately not a bare ``except`` so that interrupts and
            # timeouts raised as BaseException are not turned into "not found".
            msg = self.timegate_url.format(url=params['url'])
            raise NotFoundException(msg)

        # Short delay on a successful lookup, kept outside the ``try`` so
        # exceptions raised while sleeping propagate unchanged.
        time.sleep(0.2)
        return res

    return mock_func
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_1'))
|
||||
def test_mem_agg_index_1(agg):
|
||||
url = 'http://iana.org/'
|
||||
res, errs = agg(dict(url=url, closest='20140126000000', limit=5))
|
||||
|
||||
|
||||
exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
|
||||
{"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"},
|
||||
{"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"},
|
||||
@ -53,23 +87,25 @@ def test_mem_agg_index_1(agg):
|
||||
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_2'))
|
||||
def test_mem_agg_index_2(agg):
|
||||
url = 'http://example.com/'
|
||||
res, errs = agg(dict(url=url, closest='20100512', limit=6))
|
||||
|
||||
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
|
||||
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
|
||||
#{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
|
||||
{"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
|
||||
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
|
||||
{"timestamp": "20100513224108", "load_url": "http://web.archive.org/web/20100513224108id_/http://example.com/", "source": "ia"},
|
||||
{"timestamp": "20100511201151", 'load_url': "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
|
||||
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
|
||||
{"timestamp": "20100510233601", "load_url": "http://web.archive.org/web/20100510233601id_/http://example.com/", "source": "ia"}]
|
||||
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_3'))
|
||||
def test_mem_agg_index_3(agg):
|
||||
url = 'http://vvork.com/'
|
||||
res, errs = agg(dict(url=url, closest='20141001', limit=5))
|
||||
@ -85,6 +121,7 @@ def test_mem_agg_index_3(agg):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_4'))
|
||||
def test_mem_agg_index_4(agg):
|
||||
url = 'http://vvork.com/'
|
||||
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
|
@ -1,13 +1,13 @@
|
||||
from webagg.aggregator import RedisMultiKeyIndexSource
|
||||
from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass
|
||||
from pywb.webagg.aggregator import RedisMultiKeyIndexSource
|
||||
from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass, TEST_CDX_PATH
|
||||
|
||||
|
||||
class TestRedisAgg(FakeRedisTests, BaseTestClass):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super(TestRedisAgg, cls).setup_class()
|
||||
cls.add_cdx_to_redis(to_path('testdata/example.cdxj'), 'FOO:example:cdxj')
|
||||
cls.add_cdx_to_redis(to_path('testdata/dupes.cdxj'), 'FOO:dupes:cdxj')
|
||||
cls.add_cdx_to_redis(TEST_CDX_PATH + 'example2.cdxj', 'FOO:example:cdxj')
|
||||
cls.add_cdx_to_redis(TEST_CDX_PATH + 'dupes.cdxj', 'FOO:dupes:cdxj')
|
||||
|
||||
cls.indexloader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj')
|
||||
|
||||
@ -17,7 +17,7 @@ class TestRedisAgg(FakeRedisTests, BaseTestClass):
|
||||
exp = [
|
||||
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
{'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||
]
|
||||
|
||||
assert(errs == {})
|
||||
|
@ -1,11 +1,11 @@
|
||||
from gevent import monkey; monkey.patch_all(thread=False)
|
||||
import time
|
||||
from webagg.indexsource import FileIndexSource
|
||||
from pywb.webagg.indexsource import FileIndexSource
|
||||
|
||||
from webagg.aggregator import SimpleAggregator, TimeoutMixin
|
||||
from webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
|
||||
from pywb.webagg.aggregator import SimpleAggregator, TimeoutMixin
|
||||
from pywb.webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
|
||||
|
||||
from .testutils import to_json_list
|
||||
from .testutils import to_json_list, TEST_CDX_PATH
|
||||
|
||||
|
||||
class TimeoutFileSource(FileIndexSource):
|
||||
@ -26,8 +26,8 @@ TimeoutAggregator = GeventTimeoutAggregator
|
||||
|
||||
def setup_module():
|
||||
global sources
|
||||
sources = {'slow': TimeoutFileSource('testdata/example.cdxj', 0.2),
|
||||
'slower': TimeoutFileSource('testdata/dupes.cdxj', 0.5)
|
||||
sources = {'slow': TimeoutFileSource(TEST_CDX_PATH + 'example2.cdxj', 0.2),
|
||||
'slower': TimeoutFileSource(TEST_CDX_PATH + 'dupes.cdxj', 0.5)
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
import webtest
|
||||
|
||||
from io import BytesIO
|
||||
from webagg.app import ResAggApp
|
||||
from pywb.webagg.app import ResAggApp
|
||||
import requests
|
||||
|
||||
from webagg.handlers import DefaultResourceHandler
|
||||
from webagg.aggregator import SimpleAggregator
|
||||
from webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
|
||||
from pywb.webagg.handlers import DefaultResourceHandler
|
||||
from pywb.webagg.aggregator import SimpleAggregator
|
||||
from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
|
||||
|
@ -10,11 +10,12 @@ from mock import patch
|
||||
|
||||
from wsgiref.simple_server import make_server
|
||||
|
||||
from webagg.aggregator import SimpleAggregator
|
||||
from webagg.app import ResAggApp
|
||||
from webagg.handlers import DefaultResourceHandler
|
||||
from webagg.indexsource import LiveIndexSource
|
||||
from pywb.webagg.aggregator import SimpleAggregator
|
||||
from pywb.webagg.app import ResAggApp
|
||||
from pywb.webagg.handlers import DefaultResourceHandler
|
||||
from pywb.webagg.indexsource import LiveIndexSource
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
# ============================================================================
|
||||
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
|
||||
@ -30,6 +31,11 @@ def to_path(path):
|
||||
return path
|
||||
|
||||
|
||||
# ============================================================================
|
||||
TEST_CDX_PATH = to_path(get_test_dir() + '/cdxj/')
|
||||
TEST_WARC_PATH = to_path(get_test_dir() + '/warcs/')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class BaseTestClass(object):
|
||||
@classmethod
|
||||
|
@ -1 +1 @@
|
||||
com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example.warc.gz"}
|
||||
com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example2.warc.gz"}
|
31
sample_archive/text_content/link_headers.yaml
Normal file
31
sample_archive/text_content/link_headers.yaml
Normal file
@ -0,0 +1,31 @@
|
||||
|
||||
agg_test_1:
|
||||
'http://web.archive.org/web/{url}': '<http://iana.org/>; rel="original", <http://web.archive.org/web/timemap/link/http://iana.org/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/19971210061738/http://iana.org/>; rel="first memento"; datetime="Wed, 10 Dec 1997 06:17:38 GMT", <http://web.archive.org/web/20140123034755/http://iana.org/>; rel="prev memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT", <http://web.archive.org/web/20140126093743/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT", <http://web.archive.org/web/20140129175203/http://iana.org/>; rel="next memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT", <http://web.archive.org/web/20161107170330/http://iana.org/>; rel="last memento"; datetime="Mon, 07 Nov 2016 17:03:30 GMT"'
|
||||
|
||||
'http://wayback.archive-it.org/all/{url}': '<http://iana.org/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://iana.org/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20061001072232/http://iana.org/>; rel="first memento"; datetime="Sun, 01 Oct 2006 07:22:32 GMT", <http://wayback.archive-it.org/all/20131213010804/http://iana.org/>; rel="prev memento"; datetime="Fri, 13 Dec 2013 01:08:04 GMT", <http://wayback.archive-it.org/all/20140107040552/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT", <http://wayback.archive-it.org/all/20140328213203/http://iana.org/>; rel="next memento"; datetime="Fri, 28 Mar 2014 21:32:03 GMT", <http://wayback.archive-it.org/all/20161106014705/http://iana.org/>; rel="last memento"; datetime="Sun, 06 Nov 2016 01:47:05 GMT"'
|
||||
|
||||
agg_test_2:
|
||||
'http://web.archive.org/web/{url}': '<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100510233601/http://example.com/>; rel="prev memento"; datetime="Mon, 10 May 2010 23:36:01 GMT", <http://web.archive.org/web/20100513224108/http://example.com/>; rel="memento"; datetime="Thu, 13 May 2010 22:41:08 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20161108144631/http://example.com/>; rel="last memento"; datetime="Tue, 08 Nov 2016 14:46:31 GMT"'
|
||||
|
||||
'http://www.webarchive.org.uk/wayback/archive/{url}': '<http://example.com/>; rel="original", <//www.webarchive.org.uk/wayback/archive/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <//www.webarchive.org.uk/wayback/archive/20100513010014/http://example.com/>; rel="last memento"; datetime="Thu, 13 May 2010 01:00:14 GMT", <//www.webarchive.org.uk/wayback/archive/20080410125703/http://example.com/>; rel="first memento"; datetime="Thu, 10 Apr 2008 12:57:03 GMT", <//www.webarchive.org.uk/wayback/archive/20100512204410/http://example.com/>; rel="prev memento"; datetime="Wed, 12 May 2010 20:44:10 GMT"'
|
||||
|
||||
'http://wayback.archive-it.org/all/{url}': '<http://example.com/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20091008222031/http://example.com/>; rel="first memento"; datetime="Thu, 08 Oct 2009 22:20:31 GMT", <http://wayback.archive-it.org/all/20100427185525/http://example.com/>; rel="prev memento"; datetime="Tue, 27 Apr 2010 18:55:25 GMT", <http://wayback.archive-it.org/all/20100511201151/http://example.com/>; rel="memento"; datetime="Tue, 11 May 2010 20:11:51 GMT", <http://wayback.archive-it.org/all/20100514231857/http://example.com/>; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://wayback.archive-it.org/all/20161108080653/http://example.com/>; rel="last memento"; datetime="Tue, 08 Nov 2016 08:06:53 GMT"'
|
||||
|
||||
|
||||
agg_test_3:
|
||||
'http://web.archive.org/web/{url}': '<http://vvork.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020727091331/http://vvork.com/>; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", <http://web.archive.org/web/20140806161228/http://vvork.com/>; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", <http://web.archive.org/web/20141018133107/http://vvork.com/>; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", <http://web.archive.org/web/20141020161243/http://vvork.com/>; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", <http://web.archive.org/web/20161027001353/http://vvork.com/>; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"'
|
||||
|
||||
'http://www.webarchive.org.uk/wayback/archive/{url}': '<http://vvork.com/>; rel="original", <//www.webarchive.org.uk/wayback/archive/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <//www.webarchive.org.uk/wayback/archive/20100728221701/http://vvork.com/>; rel="last memento"; datetime="Wed, 28 Jul 2010 22:17:01 GMT", <//www.webarchive.org.uk/wayback/archive/20100124041439/http://vvork.com/>; rel="prev first memento"; datetime="Sun, 24 Jan 2010 04:14:39 GMT"'
|
||||
|
||||
'http://wayback.archive-it.org/all/{url}': '<http://vvork.com/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20131004231540/http://vvork.com/>; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", <http://wayback.archive-it.org/all/20090710005710/http://vvork.com/>; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", <http://wayback.archive-it.org/all/20131004175706/http://vvork.com/>; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"'
|
||||
|
||||
|
||||
'http://webenact.rhizome.org/vvork/{url}': '<http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", <http://www.vvork.com/>; rel="original", <http://webenact.rhizome.org/vvork/timemap/*/http://www.vvork.com/>; rel="timemap"; type="application/link-format"'
|
||||
|
||||
|
||||
agg_test_4:
|
||||
'http://wayback.archive-it.org/all/{url}': '<http://vvork.com/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20131004231540/http://vvork.com/>; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", <http://wayback.archive-it.org/all/20090710005710/http://vvork.com/>; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", <http://wayback.archive-it.org/all/20131004175706/http://vvork.com/>; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"'
|
||||
|
||||
'http://webenact.rhizome.org/vvork/{url}': '<http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", <http://www.vvork.com/>; rel="original", <http://webenact.rhizome.org/vvork/timemap/*/http://www.vvork.com/>; rel="timemap"; type="application/link-format"'
|
||||
|
||||
|
5
setup.py
5
setup.py
@ -58,7 +58,10 @@ setup(
|
||||
'pywb.manager',
|
||||
'pywb.perms',
|
||||
'pywb.webapp',
|
||||
'pywb.apps'
|
||||
'pywb.apps',
|
||||
'pywb.webagg',
|
||||
'pywb.recorder',
|
||||
'pywb.urlrewrite'
|
||||
],
|
||||
package_data={
|
||||
'pywb': ['static/flowplayer/*', 'static/*.*', 'templates/*', '*.yaml'],
|
||||
|
BIN
testdata/dupes.warc.gz
vendored
BIN
testdata/dupes.warc.gz
vendored
Binary file not shown.
BIN
testdata/example-url-agnostic-orig.warc.gz
vendored
BIN
testdata/example-url-agnostic-orig.warc.gz
vendored
Binary file not shown.
BIN
testdata/example-url-agnostic-revisit.warc.gz
vendored
BIN
testdata/example-url-agnostic-revisit.warc.gz
vendored
Binary file not shown.
BIN
testdata/iana.warc.gz
vendored
BIN
testdata/iana.warc.gz
vendored
Binary file not shown.
BIN
testdata/post-test.warc.gz
vendored
BIN
testdata/post-test.warc.gz
vendored
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user