1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor: fix pywb.webagg package paths

all webagg tests working!
move testdata cdxj into sample_archive, remove rest (duplicates) #200
This commit is contained in:
Ilya Kreymer 2016-11-08 14:30:09 -08:00
parent 99e5008ac0
commit 6b4b038471
30 changed files with 176 additions and 100 deletions

View File

@ -1,4 +1,4 @@
__version__ = '0.33.0'
__version__ = '0.50.0'
DEFAULT_CONFIG = 'pywb/default_config.yaml'

View File

@ -15,10 +15,10 @@ from heapq import merge
from collections import deque
from itertools import chain
from webagg.indexsource import FileIndexSource, RedisIndexSource
from pywb.webagg.indexsource import FileIndexSource, RedisIndexSource
from pywb.utils.wbexception import NotFoundException, WbException
from webagg.utils import ParamFormatter, res_template
from pywb.webagg.utils import ParamFormatter, res_template
import six
import glob

View File

@ -1,4 +1,4 @@
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from werkzeug.routing import Map, Rule
import requests

View File

@ -1,5 +1,5 @@
from webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
from webagg.utils import MementoUtils
from pywb.webagg.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
from pywb.webagg.utils import MementoUtils
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException

View File

@ -8,11 +8,10 @@ from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxobject import CDXObject
#from webagg.liverec import patched_requests as requests
import requests
from webagg.utils import ParamFormatter, res_template
from webagg.utils import MementoUtils
from pywb.webagg.utils import ParamFormatter, res_template
from pywb.webagg.utils import MementoUtils
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'

View File

@ -1,8 +1,8 @@
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.wbexception import NotFoundException
from webagg.indexsource import BaseIndexSource, RemoteIndexSource
from webagg.responseloader import LiveWebLoader
from webagg.utils import ParamFormatter, res_template
from pywb.webagg.indexsource import BaseIndexSource, RemoteIndexSource
from pywb.webagg.responseloader import LiveWebLoader
from pywb.webagg.utils import ParamFormatter, res_template
from pywb.utils.timeutils import timestamp_now

View File

@ -1,6 +1,6 @@
from webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
from webagg.utils import ParamFormatter
from webagg.indexsource import RedisIndexSource
from pywb.webagg.utils import MementoUtils, StreamIter, chunk_encode_iter
from pywb.webagg.utils import ParamFormatter
from pywb.webagg.indexsource import RedisIndexSource
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_timestamp
from pywb.utils.timeutils import iso_date_to_datetime, datetime_to_iso_date

View File

@ -1,10 +1,10 @@
from gevent.monkey import patch_all; patch_all()
from webagg.test.testutils import LiveServerTests
from webagg.handlers import DefaultResourceHandler
from webagg.app import ResAggApp
from webagg.indexsource import LiveIndexSource, RedisIndexSource
from webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
from pywb.webagg.test.testutils import LiveServerTests
from pywb.webagg.handlers import DefaultResourceHandler
from pywb.webagg.app import ResAggApp
from pywb.webagg.indexsource import LiveIndexSource, RedisIndexSource
from pywb.webagg.aggregator import SimpleAggregator, CacheDirectoryIndexSource
def simpleapp():
app = ResAggApp(debug=True)

View File

@ -3,15 +3,15 @@ import os
import shutil
import json
from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass, TEST_CDX_PATH
from mock import patch
import time
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import MementoIndexSource
from pywb.webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
from pywb.webagg.aggregator import SimpleAggregator
from pywb.webagg.indexsource import MementoIndexSource
#=============================================================================
@ -39,9 +39,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
dir_prefix = to_path(cls.root_dir)
dir_path ='colls/{coll}/indexes'
shutil.copy(to_path('testdata/example.cdxj'), coll_A)
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A)
shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B)
shutil.copy(to_path(TEST_CDX_PATH + 'dupes.cdxj'), coll_C)
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
fh.write('foo')
@ -57,7 +57,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_collA_found(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
exp = [{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@ -108,13 +108,13 @@ class TestDirAgg(TempDirTests, BaseTestClass):
exp = [
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
assert(errs == {})
@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
def test_agg_dir_and_memento(self):
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'local': self.dir_loader}
@ -128,7 +128,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
{'source': 'local:colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
@ -156,7 +156,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_dir_sources_1(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
@ -166,7 +166,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_dir_sources_2(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
@ -177,7 +177,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
exp = {'sources': {'example.cdxj': 'file'}}
exp = {'sources': {'example2.cdxj': 'file'}}
assert(res == exp)
@ -193,7 +193,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_cache_dir_sources_1(self):
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}

View File

@ -2,14 +2,14 @@
from collections import OrderedDict
from webagg.handlers import DefaultResourceHandler, HandlerSeq
from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from webagg.aggregator import DirectoryIndexSource
from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from pywb.webagg.aggregator import DirectoryIndexSource
from webagg.app import ResAggApp
from webagg.utils import MementoUtils
from pywb.webagg.app import ResAggApp
from pywb.webagg.utils import MementoUtils
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
@ -19,12 +19,12 @@ from six.moves.urllib.parse import urlencode
import webtest
from fakeredis import FakeStrictRedis
from .testutils import to_path, FakeRedisTests, BaseTestClass
from .testutils import to_path, FakeRedisTests, BaseTestClass, TEST_CDX_PATH, TEST_WARC_PATH
import json
sources = {
'local': DirectoryIndexSource(to_path('testdata/'), ''),
'local': DirectoryIndexSource(TEST_CDX_PATH),
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
'live': LiveIndexSource(),
@ -41,15 +41,15 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
app.add_route('/live', live_handler)
source1 = GeventTimeoutAggregator(sources)
handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
app.add_route('/many', handler1)
source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
app.add_route('/posttest', handler2)
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
source3 = SimpleAggregator({'example': FileIndexSource(TEST_CDX_PATH + 'example2.cdxj')})
handler3 = DefaultResourceHandler(source3, TEST_WARC_PATH)
app.add_route('/fallback', HandlerSeq([handler3,
handler2,
@ -63,7 +63,7 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
app.add_route('/empty', HandlerSeq([]))
app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))})
url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(TEST_CDX_PATH + 'url-agnost-example.cdxj')})
app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc'))
cls.testapp = webtest.TestApp(app)
@ -329,7 +329,7 @@ foo=bar&test=abc"""
def test_redis_warc_1(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz')
f.hset('test:warc', 'example2.warc.gz', TEST_WARC_PATH + 'example2.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')
@ -337,8 +337,8 @@ foo=bar&test=abc"""
def test_url_agnost(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz')
f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz')
f.hset('test:foo:warc', 'example-url-agnostic-revisit.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-revisit.warc.gz')
f.hset('test:foo:warc', 'example-url-agnostic-orig.warc.gz', TEST_WARC_PATH + 'example-url-agnostic-orig.warc.gz')
resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')
@ -390,22 +390,22 @@ host: www.youtube.com\
def test_error_redis_file_not_found(self):
f = FakeStrictRedis.from_url('redis://localhost/2')
f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz')
f.hset('test:warc', 'example2.warc.gz', './x-no-such-dir/example2.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'"
assert resp.json['message'] == "example2.warc.gz: [Errno 2] No such file or directory: './x-no-such-dir/example2.warc.gz'"
f.hdel('test:warc', 'example.warc.gz')
f.hdel('test:warc', 'example2.warc.gz')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}
f.delete('test:warc')
resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}
assert resp.json == {'message': 'example2.warc.gz: Archive File Not Found',
'errors': {'WARCPathLoader': 'example2.warc.gz: Archive File Not Found'}}
def test_error_fallback_live_not_found(self):

View File

@ -1,14 +1,14 @@
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
from webagg.indexsource import LiveIndexSource
from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
from pywb.webagg.indexsource import LiveIndexSource
from webagg.aggregator import SimpleAggregator
from pywb.webagg.aggregator import SimpleAggregator
from pywb.utils.timeutils import timestamp_now
from .testutils import key_ts_res
from .testutils import key_ts_res, TEST_CDX_PATH
import pytest
import os
from fakeredis import FakeStrictRedis
from mock import patch
@ -19,7 +19,7 @@ redismock.start()
def setup_module():
r = FakeStrictRedis.from_url('redis://localhost:6379/2')
r.delete('test:rediscdx')
with open('testdata/iana.cdxj', 'rb') as fh:
with open(TEST_CDX_PATH + 'iana.cdxj', 'rb') as fh:
for line in fh:
r.zadd('test:rediscdx', 0, line.rstrip())
@ -29,7 +29,7 @@ def teardown_module():
local_sources = [
FileIndexSource('testdata/iana.cdxj'),
FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
RedisIndexSource('redis://localhost:6379/2/test:rediscdx')
]

View File

@ -1,4 +1,4 @@
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from pywb.webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from bottle import Bottle, request, response, debug
import webtest
import traceback

View File

@ -1,28 +1,34 @@
from gevent import monkey; monkey.patch_all(thread=False)
from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
from webagg.aggregator import BaseAggregator
from pywb.webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
from pywb.webagg.aggregator import BaseAggregator
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
from .testutils import to_json_list, to_path
from pywb.webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
from .testutils import to_json_list, to_path, TEST_CDX_PATH
import json
import pytest
import time
import six
import yaml
from webagg.handlers import IndexHandler
from mock import patch
from pywb.webagg.handlers import IndexHandler
from pywb import get_test_dir
from pywb.utils.wbexception import NotFoundException
# Aggregator Mappings
sources = {
'local': FileIndexSource(to_path('testdata/iana.cdxj')),
'local': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
}
aggs = {'simple': SimpleAggregator(sources),
'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
}
@ -34,13 +40,41 @@ agg_nf = {'simple': SimpleAggregator(nf),
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
}
# Load expected link headers
link_header_data = None
def setup_module():
    """Load the canned Memento ``Link`` header fixture for this test module.

    Reads ``text_content/link_headers.yaml`` from the pywb test directory and
    stores the parsed result in the module-level ``link_header_data``, which
    ``mock_link_header`` indexes by test name and then by timegate URL.
    """
    global link_header_data
    fixture_path = to_path(get_test_dir() + '/text_content/link_headers.yaml')
    with open(fixture_path) as fh:
        # safe_load instead of yaml.load(fh): calling load() without an
        # explicit Loader can construct arbitrary Python objects, is
        # deprecated since PyYAML 5.1 and raises TypeError on PyYAML >= 6.
        # NOTE(review): assumes the fixture only holds plain mappings of
        # strings, which safe_load handles -- confirm against the YAML file.
        link_header_data = yaml.safe_load(fh)
orig_get_timegate_links = MementoIndexSource.get_timegate_links
def mock_link_header(test_name, load=False):
    """Build a stand-in for ``MementoIndexSource.get_timegate_links``.

    :param test_name: top-level key into ``link_header_data`` selecting the
        set of canned headers for one test.
    :param load: when true, delegate to the original (unpatched)
        ``get_timegate_links`` and print the result as a
        ``'<timegate_url>': '<header>'`` line instead of using canned data.
    :returns: a function with the ``(self, params, closest)`` signature,
        suitable for passing to ``patch``.
    :raises NotFoundException: (from the returned function) when no canned
        header exists for ``self.timegate_url`` under ``test_name``.
    """
    def mock_func(self, params, closest):
        if load:
            res = orig_get_timegate_links(self, params, closest)
            print("'{0}': '{1}'".format(self.timegate_url, res))
            return res

        try:
            res = link_header_data[test_name][self.timegate_url]
        except (KeyError, TypeError):
            # KeyError: no entry for this test name / timegate URL.
            # TypeError: fixture not loaded yet (link_header_data is None) or
            # the test's entry is empty. Anything else (KeyboardInterrupt,
            # greenlet kill/timeout) is deliberately left to propagate
            # rather than being reported as "not found".
            msg = self.timegate_url.format(url=params['url'])
            raise NotFoundException(msg)

        # Short pause on a successful lookup only; kept outside the try so
        # an interruption while sleeping is not turned into NotFoundException.
        time.sleep(0.2)
        return res

    return mock_func
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_1'))
def test_mem_agg_index_1(agg):
url = 'http://iana.org/'
res, errs = agg(dict(url=url, closest='20140126000000', limit=5))
exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
{"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"},
{"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"},
@ -53,23 +87,25 @@ def test_mem_agg_index_1(agg):
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_2'))
def test_mem_agg_index_2(agg):
url = 'http://example.com/'
res, errs = agg(dict(url=url, closest='20100512', limit=6))
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
#{"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
{"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
{"timestamp": "20100513224108", "load_url": "http://web.archive.org/web/20100513224108id_/http://example.com/", "source": "ia"},
{"timestamp": "20100511201151", 'load_url': "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
{"timestamp": "20100510233601", "load_url": "http://web.archive.org/web/20100510233601id_/http://example.com/", "source": "ia"}]
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"},
]
assert(to_json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_3'))
def test_mem_agg_index_3(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=5))
@ -85,6 +121,7 @@ def test_mem_agg_index_3(agg):
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header('agg_test_4'))
def test_mem_agg_index_4(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))

View File

@ -1,13 +1,13 @@
from webagg.aggregator import RedisMultiKeyIndexSource
from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass
from pywb.webagg.aggregator import RedisMultiKeyIndexSource
from .testutils import to_path, to_json_list, FakeRedisTests, BaseTestClass, TEST_CDX_PATH
class TestRedisAgg(FakeRedisTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRedisAgg, cls).setup_class()
cls.add_cdx_to_redis(to_path('testdata/example.cdxj'), 'FOO:example:cdxj')
cls.add_cdx_to_redis(to_path('testdata/dupes.cdxj'), 'FOO:dupes:cdxj')
cls.add_cdx_to_redis(TEST_CDX_PATH + 'example2.cdxj', 'FOO:example:cdxj')
cls.add_cdx_to_redis(TEST_CDX_PATH + 'dupes.cdxj', 'FOO:dupes:cdxj')
cls.indexloader = RedisMultiKeyIndexSource('redis://localhost/2/{user}:{coll}:cdxj')
@ -17,7 +17,7 @@ class TestRedisAgg(FakeRedisTests, BaseTestClass):
exp = [
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'FOO:dupes:cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
{'source': 'FOO:example:cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(errs == {})

View File

@ -1,11 +1,11 @@
from gevent import monkey; monkey.patch_all(thread=False)
import time
from webagg.indexsource import FileIndexSource
from pywb.webagg.indexsource import FileIndexSource
from webagg.aggregator import SimpleAggregator, TimeoutMixin
from webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
from pywb.webagg.aggregator import SimpleAggregator, TimeoutMixin
from pywb.webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
from .testutils import to_json_list
from .testutils import to_json_list, TEST_CDX_PATH
class TimeoutFileSource(FileIndexSource):
@ -26,8 +26,8 @@ TimeoutAggregator = GeventTimeoutAggregator
def setup_module():
global sources
sources = {'slow': TimeoutFileSource('testdata/example.cdxj', 0.2),
'slower': TimeoutFileSource('testdata/dupes.cdxj', 0.5)
sources = {'slow': TimeoutFileSource(TEST_CDX_PATH + 'example2.cdxj', 0.2),
'slower': TimeoutFileSource(TEST_CDX_PATH + 'dupes.cdxj', 0.5)
}

View File

@ -1,12 +1,12 @@
import webtest
from io import BytesIO
from webagg.app import ResAggApp
from pywb.webagg.app import ResAggApp
import requests
from webagg.handlers import DefaultResourceHandler
from webagg.aggregator import SimpleAggregator
from webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
from pywb.webagg.handlers import DefaultResourceHandler
from pywb.webagg.aggregator import SimpleAggregator
from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
from pywb.warc.recordloader import ArcWarcRecordLoader

View File

@ -10,11 +10,12 @@ from mock import patch
from wsgiref.simple_server import make_server
from webagg.aggregator import SimpleAggregator
from webagg.app import ResAggApp
from webagg.handlers import DefaultResourceHandler
from webagg.indexsource import LiveIndexSource
from pywb.webagg.aggregator import SimpleAggregator
from pywb.webagg.app import ResAggApp
from pywb.webagg.handlers import DefaultResourceHandler
from pywb.webagg.indexsource import LiveIndexSource
from pywb import get_test_dir
# ============================================================================
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
@ -30,6 +31,11 @@ def to_path(path):
return path
# ============================================================================
TEST_CDX_PATH = to_path(get_test_dir() + '/cdxj/')
TEST_WARC_PATH = to_path(get_test_dir() + '/warcs/')
# ============================================================================
class BaseTestClass(object):
@classmethod

View File

@ -1 +1 @@
com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example.warc.gz"}
com,example)/ 20160225042329 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "37cf167c2672a4a64af901d9484e75eee0e2c98a", "length": "1286", "offset": "363", "filename": "example2.warc.gz"}

View File

@ -0,0 +1,31 @@
agg_test_1:
'http://web.archive.org/web/{url}': '<http://iana.org/>; rel="original", <http://web.archive.org/web/timemap/link/http://iana.org/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/19971210061738/http://iana.org/>; rel="first memento"; datetime="Wed, 10 Dec 1997 06:17:38 GMT", <http://web.archive.org/web/20140123034755/http://iana.org/>; rel="prev memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT", <http://web.archive.org/web/20140126093743/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT", <http://web.archive.org/web/20140129175203/http://iana.org/>; rel="next memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT", <http://web.archive.org/web/20161107170330/http://iana.org/>; rel="last memento"; datetime="Mon, 07 Nov 2016 17:03:30 GMT"'
'http://wayback.archive-it.org/all/{url}': '<http://iana.org/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://iana.org/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20061001072232/http://iana.org/>; rel="first memento"; datetime="Sun, 01 Oct 2006 07:22:32 GMT", <http://wayback.archive-it.org/all/20131213010804/http://iana.org/>; rel="prev memento"; datetime="Fri, 13 Dec 2013 01:08:04 GMT", <http://wayback.archive-it.org/all/20140107040552/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT", <http://wayback.archive-it.org/all/20140328213203/http://iana.org/>; rel="next memento"; datetime="Fri, 28 Mar 2014 21:32:03 GMT", <http://wayback.archive-it.org/all/20161106014705/http://iana.org/>; rel="last memento"; datetime="Sun, 06 Nov 2016 01:47:05 GMT"'
agg_test_2:
'http://web.archive.org/web/{url}': '<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100510233601/http://example.com/>; rel="prev memento"; datetime="Mon, 10 May 2010 23:36:01 GMT", <http://web.archive.org/web/20100513224108/http://example.com/>; rel="memento"; datetime="Thu, 13 May 2010 22:41:08 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20161108144631/http://example.com/>; rel="last memento"; datetime="Tue, 08 Nov 2016 14:46:31 GMT"'
'http://www.webarchive.org.uk/wayback/archive/{url}': '<http://example.com/>; rel="original", <//www.webarchive.org.uk/wayback/archive/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <//www.webarchive.org.uk/wayback/archive/20100513010014/http://example.com/>; rel="last memento"; datetime="Thu, 13 May 2010 01:00:14 GMT", <//www.webarchive.org.uk/wayback/archive/20080410125703/http://example.com/>; rel="first memento"; datetime="Thu, 10 Apr 2008 12:57:03 GMT", <//www.webarchive.org.uk/wayback/archive/20100512204410/http://example.com/>; rel="prev memento"; datetime="Wed, 12 May 2010 20:44:10 GMT"'
'http://wayback.archive-it.org/all/{url}': '<http://example.com/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20091008222031/http://example.com/>; rel="first memento"; datetime="Thu, 08 Oct 2009 22:20:31 GMT", <http://wayback.archive-it.org/all/20100427185525/http://example.com/>; rel="prev memento"; datetime="Tue, 27 Apr 2010 18:55:25 GMT", <http://wayback.archive-it.org/all/20100511201151/http://example.com/>; rel="memento"; datetime="Tue, 11 May 2010 20:11:51 GMT", <http://wayback.archive-it.org/all/20100514231857/http://example.com/>; rel="next memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://wayback.archive-it.org/all/20161108080653/http://example.com/>; rel="last memento"; datetime="Tue, 08 Nov 2016 08:06:53 GMT"'
agg_test_3:
'http://web.archive.org/web/{url}': '<http://vvork.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020727091331/http://vvork.com/>; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", <http://web.archive.org/web/20140806161228/http://vvork.com/>; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", <http://web.archive.org/web/20141018133107/http://vvork.com/>; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", <http://web.archive.org/web/20141020161243/http://vvork.com/>; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", <http://web.archive.org/web/20161027001353/http://vvork.com/>; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"'
'http://www.webarchive.org.uk/wayback/archive/{url}': '<http://vvork.com/>; rel="original", <//www.webarchive.org.uk/wayback/archive/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <//www.webarchive.org.uk/wayback/archive/20100728221701/http://vvork.com/>; rel="last memento"; datetime="Wed, 28 Jul 2010 22:17:01 GMT", <//www.webarchive.org.uk/wayback/archive/20100124041439/http://vvork.com/>; rel="prev first memento"; datetime="Sun, 24 Jan 2010 04:14:39 GMT"'
'http://wayback.archive-it.org/all/{url}': '<http://vvork.com/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20131004231540/http://vvork.com/>; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", <http://wayback.archive-it.org/all/20090710005710/http://vvork.com/>; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", <http://wayback.archive-it.org/all/20131004175706/http://vvork.com/>; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"'
'http://webenact.rhizome.org/vvork/{url}': '<http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", <http://www.vvork.com/>; rel="original", <http://webenact.rhizome.org/vvork/timemap/*/http://www.vvork.com/>; rel="timemap"; type="application/link-format"'
agg_test_4:
'http://wayback.archive-it.org/all/{url}': '<http://vvork.com/>; rel="original", <http://wayback.archive-it.org/all/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <http://wayback.archive-it.org/all/20131004231540/http://vvork.com/>; rel="last memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT", <http://wayback.archive-it.org/all/20090710005710/http://vvork.com/>; rel="first memento"; datetime="Fri, 10 Jul 2009 00:57:10 GMT", <http://wayback.archive-it.org/all/20131004175706/http://vvork.com/>; rel="prev memento"; datetime="Fri, 04 Oct 2013 17:57:06 GMT"'
'http://webenact.rhizome.org/vvork/{url}': '<http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", <http://www.vvork.com/>; rel="original", <http://webenact.rhizome.org/vvork/timemap/*/http://www.vvork.com/>; rel="timemap"; type="application/link-format"'

View File

@ -58,7 +58,10 @@ setup(
'pywb.manager',
'pywb.perms',
'pywb.webapp',
'pywb.apps'
'pywb.apps',
'pywb.webagg',
'pywb.recorder',
'pywb.urlrewrite'
],
package_data={
'pywb': ['static/flowplayer/*', 'static/*.*', 'templates/*', '*.yaml'],

BIN
testdata/dupes.warc.gz vendored

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
testdata/iana.warc.gz vendored

Binary file not shown.

Binary file not shown.