1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

webagg: Fix loading of url-lookup (url agnostic) revisits, ensure all params passed to cdx lookup, add tests for url-agnostic revisit lookup

This commit is contained in:
Ilya Kreymer 2016-08-04 16:53:24 -04:00
parent 20b161bf90
commit c93d7ecafc
5 changed files with 33 additions and 6 deletions

Binary file not shown.

Binary file not shown.

2
testdata/url-agnost-example.cdxj vendored Normal file
View File

@ -0,0 +1,2 @@
com,example)/ 20130729195151 {"url": "http://test@example.com/", "mime": "warc/revisit", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "591", "offset": "355", "filename": "example-url-agnostic-revisit.warc.gz"}
org,iana,example)/ 20130702195402 {"url": "http://example.iana.org/", "mime": "text/html", "status": "200", "digest": "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", "length": "1001", "offset": "353", "filename": "example-url-agnostic-orig.warc.gz"}

View File

@ -166,10 +166,6 @@ class WARCPathLoader(BaseLoader):
self.cdx_source = cdx_source
def cdx_index_source(self, *args, **kwargs):
    """Query ``self.cdx_source`` and return only the cdx iterator.

    All positional and keyword arguments are forwarded unchanged. The call
    must produce exactly two items; the second (the errors) is dropped.
    """
    records, _unused_errs = self.cdx_source(*args, **kwargs)
    return records
def _make_resolver(self, path):
if hasattr(path, '__call__'):
return path
@ -188,13 +184,26 @@ class WARCPathLoader(BaseLoader):
return None
orig_source = cdx.get('source', '').split(':')[0]
cdx._formatter = ParamFormatter(params, orig_source)
formatter = ParamFormatter(params, orig_source)
cdx._formatter = formatter
def local_index_query(local_params):
    """Yield cdx records for a follow-up index lookup.

    Every ``param.``-prefixed entry of the enclosing ``params`` is copied
    into ``local_params`` (which is modified in place) before the cdx source
    is queried. Each record produced is tagged with the enclosing
    ``formatter`` before being yielded.

    This is a generator: callers iterate over it to receive the records.
    """
    for name, value in six.iteritems(params):
        if name.startswith('param.'):
            local_params[name] = value

    # The source returns an (iterator, errors) pair; errors are not used here.
    cdx_iter, _errs = self.cdx_source(local_params)

    # A distinct loop name avoids shadowing the ``cdx`` of the enclosing scope.
    for found_cdx in cdx_iter:
        found_cdx._formatter = formatter
        yield found_cdx

    # Deliberately no ``return cdx_iter`` here: returning a value from a
    # generator is a SyntaxError on Python 2, and on Python 3 it would only
    # set an unused StopIteration value.
failed_files = []
headers, payload = (self.resolve_loader.
load_headers_and_payload(cdx,
failed_files,
self.cdx_index_source))
local_index_query))
if cdx.get('status', '').startswith('3'):
status_headers = self.headers_parser.parse(payload.stream)

View File

@ -63,6 +63,9 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
app.add_route('/empty', HandlerSeq([]))
app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
url_agnost = SimpleAggregator({'url-agnost': FileIndexSource(to_path('testdata/url-agnost-example.cdxj'))})
app.add_route('/urlagnost', DefaultResourceHandler(url_agnost, 'redis://localhost/2/test:{arg}:warc'))
cls.testapp = webtest.TestApp(app)
def _check_uri_date(self, resp, uri, dt):
@ -85,6 +88,7 @@ class TestResAgg(FakeRedisTests, BaseTestClass):
'/posttest', '/posttest/postreq',
'/seq', '/seq/postreq',
'/allredis', '/allredis/postreq',
'/urlagnost', '/urlagnost/postreq',
'/invalid', '/invalid/postreq'])
assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}
@ -331,6 +335,18 @@ foo=bar&test=abc"""
assert resp.headers['WebAgg-Source-Coll'] == 'example'
def test_url_agnost(self):
    """Fetch a resource through /urlagnost and check the reported memento headers.

    Two WARC filename -> path entries are first stored in the
    'test:foo:warc' hash of the fake redis instance; the request passes
    ``param.arg=foo``.
    """
    fake_redis = FakeStrictRedis.from_url('redis://localhost/2')
    warc_entries = (
        ('example-url-agnostic-revisit.warc.gz', './testdata/example-url-agnostic-revisit.warc.gz'),
        ('example-url-agnostic-orig.warc.gz', './testdata/example-url-agnostic-orig.warc.gz'),
    )
    for warc_name, warc_path in warc_entries:
        fake_redis.hset('test:foo:warc', warc_name, warc_path)

    resp = self.testapp.get('/urlagnost/resource?url=http://example.com/&param.arg=foo')

    assert resp.status_int == 200

    expected_link = MementoUtils.make_link('http://test@example.com/', 'original')
    assert resp.headers['Link'] == expected_link
    assert resp.headers['WebAgg-Source-Coll'] == 'url-agnost'
    assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
def test_live_video_loader(self):
params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
'content_type': 'application/vnd.youtube-dl_formats+json'