1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

test coverage pass:

refactor and cleanup to improve coverage for corner cases
This commit is contained in:
Ilya Kreymer 2014-04-02 13:16:54 -07:00
parent 8d3d326c9e
commit 91184426b7
12 changed files with 126 additions and 86 deletions

View File

@ -1,6 +1,6 @@
try:
try: # pragma: no cover
from collections import OrderedDict
except ImportError:
except ImportError: # pragma: no cover
from ordereddict import OrderedDict
import itertools

View File

@ -6,9 +6,9 @@ org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3
# test idx index (tabs replaced with 4 spaces)
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
org,iana)/dnssec 20140126201307 zipnum 8511 373
org,iana)/domains/int 20140126201239 zipnum 8884 353
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz

View File

@ -109,7 +109,7 @@ class ReplayView:
response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
if self.content_rewriter and not wbrequest.is_identity:
response = self.rewrite_content(wbrequest,
cdx,
@ -182,7 +182,7 @@ class ReplayView:
(status_headers, response_gen) = result
if self.buffer_response:
if wbrequest.wb_url.mod == 'id_':
if wbrequest.is_identity:
status_headers.remove_header('content-length')
response_gen = self.buffered_response(status_headers, response_gen)
@ -244,7 +244,7 @@ class ReplayView:
# skip all 304s
if (status_headers.statusline.startswith('304') and
not wbrequest.wb_url.mod == 'id_'):
not wbrequest.is_identity):
raise CaptureException('Skipping 304 Modified: ' + str(cdx))
@ -298,16 +298,18 @@ class ReplayView:
>>> ReplayView.strip_scheme('about://example.com') ==\
ReplayView.strip_scheme('example.com')
True
>>> ReplayView.strip_scheme('http://') ==\
ReplayView.strip_scheme('')
True
>>> ReplayView.strip_scheme('#!@?') ==\
ReplayView.strip_scheme('#!@?')
True
"""
m = ReplayView.STRIP_SCHEME.match(url)
if not m:
return url
match = m.group(2)
if match:
return match
else:
return url
return match
if __name__ == "__main__":

View File

@ -10,6 +10,10 @@
>>> print_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# ajax
>>> print_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'})
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> print_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}

View File

@ -100,14 +100,12 @@ class WbRequest(object):
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value:
return False
if value.lower() == 'xmlhttprequest':
if value and value.lower() == 'xmlhttprequest':
return True
if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
return True
#if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
# return True
return False
def __repr__(self):

View File

@ -65,9 +65,6 @@ class WSGIApp(object):
msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
raise NotFoundException(msg)
# except InternalRedirect as ir:
# return ir.response
except WbException as e:
response = handle_exception(env, wb_router, e, False)
@ -115,19 +112,18 @@ def init_app(init_func, load_yaml=True, config_file=None):
level=logging.DEBUG)
logging.debug('')
if load_yaml:
# env setting overrides all others
env_config = os.environ.get('PYWB_CONFIG_FILE')
if env_config:
config_file = env_config
if not config_file:
config_file = DEFAULT_CONFIG_FILE
config = load_yaml_config(config_file)
try:
if load_yaml:
# env setting overrides all others
env_config = os.environ.get('PYWB_CONFIG_FILE')
if env_config:
config_file = env_config
if not config_file:
config_file = DEFAULT_CONFIG_FILE
config = load_yaml_config(config_file)
wb_router = init_func(config)
else:
wb_router = init_func()

View File

@ -68,6 +68,10 @@ Exception: Bad Request Url: http://#$%#/
Traceback (most recent call last):
Exception: Bad Request Url: http://example.com:abc/
>>> x = WbUrl('')
Traceback (most recent call last):
Exception: ('Invalid WbUrl: ', '')
# considered blank
>>> x = WbUrl('https:/')
>>> x = WbUrl('https:///')

View File

@ -79,8 +79,8 @@ class BaseWbUrl(object):
class WbUrl(BaseWbUrl):
# Regexs
# ======================
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.+)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
DEFAULT_SCHEME = 'http://'
# ======================
@ -90,11 +90,9 @@ class WbUrl(BaseWbUrl):
self.original_url = url
if not any(f(url) for f in [self._init_query, self._init_replay]):
raise Exception('Invalid WbUrl: ', url)
if len(self.url) == 0:
raise Exception('Invalid WbUrl: ', url)
if not self._init_query(url):
if not self._init_replay(url):
raise Exception('Invalid WbUrl: ', url)
# protocol agnostic url -> http://
# no protocol -> http://

View File

@ -21,9 +21,12 @@ ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
#=================================================================
class ArchiveLoadFailed(WbException):
def __init__(self, reason, filename=''):
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
#self.filename = filename
#self.reason = reason
if filename:
msg = filename + ':' + str(reason)
else:
msg = str(reason)
super(ArchiveLoadFailed, self).__init__(msg)
def status(self):
return '503 Service Unavailable'

View File

@ -233,6 +233,19 @@ Exception: ArchiveLoadFailed
# Invalid WARC
>>> parse_stream_error(stream=None, statusline='ABC', known_format='warc')
Exception: ArchiveLoadFailed
# Revisit Errors
# original not found
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - - 1864 example.warc.gz - - -', reraise=True)
Traceback (most recent call last):
ArchiveLoadFailed: Missing Revisit Original
# no revisit func available
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX, revisit_func=None, reraise=True)
Traceback (most recent call last):
ArchiveLoadFailed: Original for revisit could not be loaded
"""
import os
@ -281,16 +294,19 @@ def load_orig_cdx(self):
#==============================================================================
def load_from_cdx_test(cdx):
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False):
resolve_loader = ResolvingLoader(test_warc_dir)
cdx = CDXObject(cdx)
try:
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx)
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, revisit_func)
print headers
sys.stdout.write(stream.readline())
sys.stdout.write(stream.readline())
except ArchiveLoadFailed as e:
print 'Exception: ' + e.__class__.__name__
if reraise:
raise
else:
print 'Exception: ' + e.__class__.__name__
#==============================================================================

View File

@ -1,38 +1,38 @@
com,example)/ 20140127171200 zipnum 0 276
org,iana)/ 20140127171238 zipnum 276 328
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289
org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208
org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207
org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276
org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210
org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211
org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281
org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298
org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213
org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216
org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270
org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215
org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209
org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210
org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410
org,iana)/dnssec 20140126201307 zipnum 8511 373
org,iana)/domains/int 20140126201239 zipnum 8884 353
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386
org,iana)/time-zones 20140126200737 zipnum 9623 145
com,example)/ 20140127171200 zipnum 0 276 1
org,iana)/ 20140127171238 zipnum 276 328 2
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 5
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 6
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 7
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 8
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 9
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312 10
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234 11
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 12
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 13
org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 14
org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 15
org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 16
org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 17
org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 18
org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 19
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 20
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 21
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 22
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 23
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 24
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 25
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 26
org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 27
org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 28
org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 29
org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 30
org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 31
org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 32
org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 33
org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 34
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
org,iana)/time-zones 20140126200737 zipnum 9623 145 38

View File

@ -1,8 +1,10 @@
from pytest import raises
import webtest
from pywb.core.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
class TestWb:
TEST_CONFIG = 'tests/test_config.yaml'
@ -38,6 +40,16 @@ class TestWb:
self._assert_basic_html(resp)
assert 'Search' in resp.body
# Issue a HEAD request against the '/pywb/' collection root and check that
# it is answered with a 200 status and a 'text/html' content type.
def test_pywb_root_head(self):
resp = self.testapp.head('/pywb/')
assert resp.content_type == 'text/html'
assert resp.status_int == 200
# Issue a HEAD request against the path '/blah/' and check that the
# response is a 404 that still carries a 'text/html' content type.
def test_pywb_invalid_path(self):
# status=404 is passed to the test app as the expected response status
resp = self.testapp.head('/blah/', status=404)
assert resp.content_type == 'text/html'
assert resp.status_int == 404
def test_calendar_query(self):
resp = self.testapp.get('/pywb/*/iana.org')
self._assert_basic_html(resp)
@ -246,3 +258,10 @@ class TestWb:
assert resp.status_int == 400
assert 'Invalid Url: http://?abc' in resp.body
# Check that init_app raises IOError when asked to load its YAML config
# from the file name 'x-invalid-x'.
# NOTE(review): init_app lets the PYWB_CONFIG_FILE environment variable
# override config_file, so this test presumably needs that variable to be
# unset -- verify.
def test_invalid_config(self):
with raises(IOError):
init_app(create_wb_router,
load_yaml=True,
config_file='x-invalid-x')