2014-03-26 11:33:46 -07:00
|
|
|
from pywb.apps.cdx_server import application
|
|
|
|
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
|
2015-03-19 13:29:29 -07:00
|
|
|
import pywb.cdx.cdxobject as obj
|
2014-03-26 11:33:46 -07:00
|
|
|
|
2014-04-04 10:09:26 -07:00
|
|
|
from pywb.utils.dsrules import DEFAULT_RULES_FILE
|
2014-03-26 11:33:46 -07:00
|
|
|
from pywb.utils.wbexception import AccessException, NotFoundException
|
|
|
|
from pywb.utils.wbexception import BadRequestException, WbException
|
|
|
|
|
2016-02-18 21:26:40 -08:00
|
|
|
from six.moves.urllib.error import HTTPError
|
2014-03-26 11:33:46 -07:00
|
|
|
|
|
|
|
from mock import patch
|
|
|
|
from pytest import raises
|
|
|
|
import webtest
|
2016-02-18 21:26:40 -08:00
|
|
|
import unittest
|
|
|
|
|
|
|
|
import six
|
2014-03-26 11:33:46 -07:00
|
|
|
|
|
|
|
from pywb import get_test_dir
|
|
|
|
|
|
|
|
# Path passed to the local CDXServer instances in the tests below.
TEST_CDX_DIR = get_test_dir() + 'cdx/'

# URL handed to CDXServer / RemoteCDXServer for the "remote" test variants;
# the actual network call is replaced by the urlopen mocks in this module.
CDX_SERVER_URL = 'http://localhost/pywb-cdx'

# Expected (field, value) pairs, in order, of the second record returned
# for the example.com queries issued by the match assertions.
CDX_RESULT = [
    (obj.URLKEY, 'com,example)/'),
    (obj.TIMESTAMP, '20140127171200'),
    (obj.ORIGINAL, 'http://example.com'),
    (obj.MIMETYPE, 'text/html'),
    (obj.STATUSCODE, '200'),
    (obj.DIGEST, 'B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'),
    (obj.REDIRECT, '-'),
    (obj.ROBOTFLAGS, '-'),
    (obj.LENGTH, '1046'),
    (obj.OFFSET, '334'),
    (obj.FILENAME, 'dupes.warc.gz'),
]

# WebTest client wrapping the CDX server application; assigned in
# setup_module() and used by the urlopen mocks.
testapp = None
|
2014-03-01 16:35:27 -08:00
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def setup_module(self):
    """Build the module-wide WebTest client around the CDX server app.

    The single argument is supplied by the test runner and is not used.
    The created client is stored in the module-level ``testapp`` so the
    urlopen mocks can route requests to it.
    """
    global testapp
    testapp = webtest.TestApp(application)
|
2014-02-27 18:43:55 -08:00
|
|
|
|
2014-02-17 02:34:39 -08:00
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def mock_urlopen(req):
    """Replacement for ``urlopen`` that answers from the in-process test app.

    :param req: request object exposing ``get_full_url()``
    :return: list of byte strings, the response body split on newline bytes
    """
    body = testapp.get(req.get_full_url()).body
    return body.split(b'\n')
|
2014-02-17 02:34:39 -08:00
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def mock_urlopen_err(err):
    """Return a ``urlopen`` replacement that always fails with HTTP ``err``.

    :param err: HTTP status code to put on the raised ``HTTPError``
    :return: a one-argument callable; when called with a request object
        (exposing ``get_full_url()``) it raises ``HTTPError`` for that URL
    """
    def make_err(req):
        url = req.get_full_url()
        # msg, hdrs and fp are deliberately left as None.
        raise HTTPError(url, err, None, None, None)

    return make_err
|
|
|
|
|
|
|
|
# First time expect a 404 when called with 'exact',
# Second time expect a 200 for fuzzy match
def mock_urlopen_fuzzy(req):
    """Replacement for ``urlopen`` used by the remote fuzzy-match test.

    Requests whose URL contains ``'exact'`` must be answered with 404 by
    the test app and are then reported to the caller as an ``HTTPError``;
    all other requests must be answered with 200 and yield the body lines.

    :param req: request object exposing ``get_full_url()``
    :return: list of byte strings, the response body split on newline bytes
    :raises HTTPError: with code 404 when the URL contains ``'exact'``
    """
    url = req.get_full_url()
    print(url)

    if 'exact' in url:
        # Passing status=404 makes webtest check the app's actual status
        # before we surface the failure the way urlopen would.
        testapp.get(url, status=404)
        # Raise directly: the original applied ``raise`` to a helper call
        # that itself raised, so the outer ``raise`` never executed.
        raise HTTPError(url, 404, None, None, None)

    resp = testapp.get(url, status=200)
    return resp.body.split(b'\n')
|
|
|
|
|
2016-02-18 21:26:40 -08:00
|
|
|
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen)
def assert_cdx_match(server):
    """Query ``server`` for example.com and check the second record.

    Runs with ``urlopen`` patched to ``mock_urlopen``. Asserts that the
    (field, value) pairs of the record at index 1 equal ``CDX_RESULT``.

    :param server: object providing ``load_cdx(**params)``
    """
    results = server.load_cdx(url='example.com', limit=2, output='cdxobject')
    records = list(results)
    second_record_fields = list(records[1].items())
    assert second_record_fields == CDX_RESULT
|
2014-03-26 11:33:46 -07:00
|
|
|
|
|
|
|
def assert_cdx_fuzzy_match(server, mock=mock_urlopen):
    """Issue a fuzzy query against ``server`` and check the second record.

    :param server: object providing ``load_cdx(**params)``
    :param mock: callable installed in place of ``urlopen`` for the query
    """
    query = dict(url='http://example.com?_=123',
                 limit=2,
                 output='cdxobject',
                 allowFuzzy=True)

    with patch('pywb.cdx.cdxsource.urlopen', mock):
        # Materialize while the patch is active, in case results are lazy.
        records = list(server.load_cdx(**query))

    second_record_fields = list(records[1].items())
    assert second_record_fields == CDX_RESULT
|
2014-03-26 11:33:46 -07:00
|
|
|
|
|
|
|
|
2016-02-18 21:26:40 -08:00
|
|
|
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(404))
def assert_404(server):
    """Run a CDX query on ``server`` while ``urlopen`` fails with HTTP 404.

    Performs no assertion itself; callers wrap it to check the exception.
    """
    query_url = 'http://notfound.example.com'
    server.load_cdx(url=query_url)
|
|
|
|
|
|
|
|
|
2016-02-18 21:26:40 -08:00
|
|
|
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(403))
def assert_403(server):
    """Run a CDX query on ``server`` while ``urlopen`` fails with HTTP 403.

    Performs no assertion itself; callers wrap it to check the exception.
    """
    query_url = 'http://notfound.example.com'
    server.load_cdx(url=query_url)
|
|
|
|
|
|
|
|
|
2016-02-18 21:26:40 -08:00
|
|
|
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(400))
def assert_400(server):
    """Run a CDX query on ``server`` while ``urlopen`` fails with HTTP 400.

    Performs no assertion itself; callers wrap it to check the exception.
    """
    query_url = 'http://notfound.example.com'
    server.load_cdx(url=query_url)
|
|
|
|
|
|
|
|
|
2016-02-18 21:26:40 -08:00
|
|
|
@patch('pywb.cdx.cdxsource.urlopen', mock_urlopen_err(502))
def assert_502(server):
    """Run a CDX query on ``server`` while ``urlopen`` fails with HTTP 502.

    Performs no assertion itself; callers wrap it to check the exception.
    """
    query_url = 'http://notfound.example.com'
    server.load_cdx(url=query_url)
|
|
|
|
|
|
|
|
|
|
|
|
def test_match():
    """Check the expected record for local, remote-source and remote-query servers."""
    # Local CDX Server
    local_server = CDXServer([TEST_CDX_DIR])
    assert_cdx_match(local_server)

    # Remote CDX Source, Local Filtering
    remote_source_server = CDXServer(CDX_SERVER_URL)
    assert_cdx_match(remote_source_server)

    # Remote CDX Query (Remote Filtering)
    remote_query_server = RemoteCDXServer(CDX_SERVER_URL)
    assert_cdx_match(remote_query_server)
|
|
|
|
|
|
|
|
|
|
|
|
def test_fuzzy_match():
    """Check fuzzy matching for local, remote-source and remote-query servers."""
    # Local CDX Server
    local_server = CDXServer([TEST_CDX_DIR],
                             ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(local_server)

    # Remote CDX Source, Local Filtering
    # two calls to remote, first exact with 404,
    # then fuzzy with 200
    remote_source_server = CDXServer(CDX_SERVER_URL,
                                     ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(remote_source_server, mock_urlopen_fuzzy)

    # Remote CDX Query (Remote Filtering)
    # fuzzy match handled on remote, single response
    remote_query_server = RemoteCDXServer(CDX_SERVER_URL,
                                          ds_rules_file=DEFAULT_RULES_FILE)
    assert_cdx_fuzzy_match(remote_query_server)
|
2014-02-28 01:39:04 +00:00
|
|
|
|
2014-05-16 21:16:50 -07:00
|
|
|
def test_fuzzy_no_match_1():
    """A fuzzy-enabled query for an unknown URL raises NotFoundException."""
    # no match, no fuzzy
    query = dict(url='http://notfound.example.com/',
                 output='cdxobject',
                 reverse=True,
                 allowFuzzy=True)

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
|
|
|
|
|
|
|
|
def test_fuzzy_no_match_2():
    """A fuzzy-enabled query with a ``_=`` parameter on an unknown URL raises NotFoundException."""
    # fuzzy rule, but no actual match
    query = dict(url='http://notfound.example.com/?_=1234',
                 closest='2014',
                 reverse=True,
                 output='cdxobject',
                 allowFuzzy=True)

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
|
|
|
|
|
|
|
|
def test2_fuzzy_no_match_3():
    """A fuzzy-enabled query for test.example.example raises NotFoundException."""
    # special fuzzy rule, matches prefix test.example.example.,
    # but doesn't match rule regex
    query = dict(url='http://test.example.example/',
                 allowFuzzy=True)

    with patch('pywb.cdx.cdxsource.urlopen', mock_urlopen):
        server = CDXServer([TEST_CDX_DIR], ds_rules_file=DEFAULT_RULES_FILE)
        with raises(NotFoundException):
            server.load_cdx(**query)
|
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def assert_error(func, exception):
    """Check that ``func`` raises ``exception`` for both remote server types.

    :param func: callable taking a server instance (e.g. ``assert_404``)
    :param exception: exception class expected from each call
    """
    # Same check against the remote-source and the remote-query server,
    # each built from CDX_SERVER_URL inside its own ``raises`` scope.
    for server_cls in (CDXServer, RemoteCDXServer):
        with raises(exception):
            func(server_cls(CDX_SERVER_URL))
|
2014-02-17 02:34:39 -08:00
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def test_err_404():
    """HTTP 404 from the remote (and a local miss) surfaces as NotFoundException."""
    expected = NotFoundException

    # Test local for consistency
    with raises(expected):
        assert_404(CDXServer([TEST_CDX_DIR]))

    assert_error(assert_404, expected)
|
2014-02-17 02:34:39 -08:00
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def test_err_403():
    """HTTP 403 from the remote surfaces as AccessException."""
    expected = AccessException
    assert_error(assert_403, expected)
|
2014-02-28 01:39:04 +00:00
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def test_err_400():
    """HTTP 400 from the remote surfaces as BadRequestException."""
    expected = BadRequestException
    assert_error(assert_400, expected)
|
2014-02-17 02:34:39 -08:00
|
|
|
|
2014-03-26 11:33:46 -07:00
|
|
|
def test_err_502():
    """HTTP 502 from the remote surfaces as WbException."""
    expected = WbException
    assert_error(assert_502, expected)
|