1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

test coverage pass:

refactor and cleanup to improve coverage for corner cases
This commit is contained in:
Ilya Kreymer 2014-04-02 13:16:54 -07:00
parent 8d3d326c9e
commit 91184426b7
12 changed files with 126 additions and 86 deletions

View File

@ -1,6 +1,6 @@
try: try: # pragma: no cover
from collections import OrderedDict from collections import OrderedDict
except ImportError: except ImportError: # pragma: no cover
from ordereddict import OrderedDict from ordereddict import OrderedDict
import itertools import itertools

View File

@ -6,9 +6,9 @@ org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3
# test idx index (tabs replaced with 4 spaces) # test idx index (tabs replaced with 4 spaces)
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True) >>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
org,iana)/dnssec 20140126201307 zipnum 8511 373 org,iana)/dnssec 20140126201307 zipnum 8511 373 35
org,iana)/domains/int 20140126201239 zipnum 8884 353 org,iana)/domains/int 20140126201239 zipnum 8884 353 36
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') >>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz

View File

@ -109,7 +109,7 @@ class ReplayView:
response = None response = None
if self.content_rewriter and wbrequest.wb_url.mod != 'id_': if self.content_rewriter and not wbrequest.is_identity:
response = self.rewrite_content(wbrequest, response = self.rewrite_content(wbrequest,
cdx, cdx,
@ -182,7 +182,7 @@ class ReplayView:
(status_headers, response_gen) = result (status_headers, response_gen) = result
if self.buffer_response: if self.buffer_response:
if wbrequest.wb_url.mod == 'id_': if wbrequest.is_identity:
status_headers.remove_header('content-length') status_headers.remove_header('content-length')
response_gen = self.buffered_response(status_headers, response_gen) response_gen = self.buffered_response(status_headers, response_gen)
@ -244,7 +244,7 @@ class ReplayView:
# skip all 304s # skip all 304s
if (status_headers.statusline.startswith('304') and if (status_headers.statusline.startswith('304') and
not wbrequest.wb_url.mod == 'id_'): not wbrequest.is_identity):
raise CaptureException('Skipping 304 Modified: ' + str(cdx)) raise CaptureException('Skipping 304 Modified: ' + str(cdx))
@ -298,16 +298,18 @@ class ReplayView:
>>> ReplayView.strip_scheme('about://example.com') ==\ >>> ReplayView.strip_scheme('about://example.com') ==\
ReplayView.strip_scheme('example.com') ReplayView.strip_scheme('example.com')
True True
>>> ReplayView.strip_scheme('http://') ==\
ReplayView.strip_scheme('')
True
>>> ReplayView.strip_scheme('#!@?') ==\
ReplayView.strip_scheme('#!@?')
True
""" """
m = ReplayView.STRIP_SCHEME.match(url) m = ReplayView.STRIP_SCHEME.match(url)
if not m:
return url
match = m.group(2) match = m.group(2)
if match: return match
return match
else:
return url
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -10,6 +10,10 @@
>>> print_req_from_uri('/2010/example.com') >>> print_req_from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
# ajax
>>> print_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'})
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> print_req_from_uri('../example.com') >>> print_req_from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}

View File

@ -100,14 +100,12 @@ class WbRequest(object):
def _is_ajax(self): def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH') value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value: if value and value.lower() == 'xmlhttprequest':
return False
if value.lower() == 'xmlhttprequest':
return True return True
if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')): #if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
return True # return True
return False return False
def __repr__(self): def __repr__(self):

View File

@ -65,9 +65,6 @@ class WSGIApp(object):
msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI']) msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
raise NotFoundException(msg) raise NotFoundException(msg)
# except InternalRedirect as ir:
# return ir.response
except WbException as e: except WbException as e:
response = handle_exception(env, wb_router, e, False) response = handle_exception(env, wb_router, e, False)
@ -115,19 +112,18 @@ def init_app(init_func, load_yaml=True, config_file=None):
level=logging.DEBUG) level=logging.DEBUG)
logging.debug('') logging.debug('')
if load_yaml:
# env setting overrides all others
env_config = os.environ.get('PYWB_CONFIG_FILE')
if env_config:
config_file = env_config
if not config_file:
config_file = DEFAULT_CONFIG_FILE
config = load_yaml_config(config_file)
try: try:
if load_yaml: if load_yaml:
# env setting overrides all others
env_config = os.environ.get('PYWB_CONFIG_FILE')
if env_config:
config_file = env_config
if not config_file:
config_file = DEFAULT_CONFIG_FILE
config = load_yaml_config(config_file)
wb_router = init_func(config) wb_router = init_func(config)
else: else:
wb_router = init_func() wb_router = init_func()

View File

@ -68,6 +68,10 @@ Exception: Bad Request Url: http://#$%#/
Traceback (most recent call last): Traceback (most recent call last):
Exception: Bad Request Url: http://example.com:abc/ Exception: Bad Request Url: http://example.com:abc/
>>> x = WbUrl('')
Traceback (most recent call last):
Exception: ('Invalid WbUrl: ', '')
# considered blank # considered blank
>>> x = WbUrl('https:/') >>> x = WbUrl('https:/')
>>> x = WbUrl('https:///') >>> x = WbUrl('https:///')

View File

@ -79,8 +79,8 @@ class BaseWbUrl(object):
class WbUrl(BaseWbUrl): class WbUrl(BaseWbUrl):
# Regexs # Regexs
# ====================== # ======================
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.+)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
DEFAULT_SCHEME = 'http://' DEFAULT_SCHEME = 'http://'
# ====================== # ======================
@ -90,11 +90,9 @@ class WbUrl(BaseWbUrl):
self.original_url = url self.original_url = url
if not any(f(url) for f in [self._init_query, self._init_replay]): if not self._init_query(url):
raise Exception('Invalid WbUrl: ', url) if not self._init_replay(url):
raise Exception('Invalid WbUrl: ', url)
if len(self.url) == 0:
raise Exception('Invalid WbUrl: ', url)
# protocol agnostic url -> http:// # protocol agnostic url -> http://
# no protocol -> http:// # no protocol -> http://

View File

@ -21,9 +21,12 @@ ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
#================================================================= #=================================================================
class ArchiveLoadFailed(WbException): class ArchiveLoadFailed(WbException):
def __init__(self, reason, filename=''): def __init__(self, reason, filename=''):
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) if filename:
#self.filename = filename msg = filename + ':' + str(reason)
#self.reason = reason else:
msg = str(reason)
super(ArchiveLoadFailed, self).__init__(msg)
def status(self): def status(self):
return '503 Service Unavailable' return '503 Service Unavailable'

View File

@ -233,6 +233,19 @@ Exception: ArchiveLoadFailed
# Invalid WARC # Invalid WARC
>>> parse_stream_error(stream=None, statusline='ABC', known_format='warc') >>> parse_stream_error(stream=None, statusline='ABC', known_format='warc')
Exception: ArchiveLoadFailed Exception: ArchiveLoadFailed
# Revisit Errors
# original not found
>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - - 1864 example.warc.gz - - -', reraise=True)
Traceback (most recent call last):
ArchiveLoadFailed: Missing Revisit Original
# no revisit func available
>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX, revisit_func=None, reraise=True)
Traceback (most recent call last):
ArchiveLoadFailed: Original for revisit could not be loaded
""" """
import os import os
@ -281,16 +294,19 @@ def load_orig_cdx(self):
#============================================================================== #==============================================================================
def load_from_cdx_test(cdx): def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False):
resolve_loader = ResolvingLoader(test_warc_dir) resolve_loader = ResolvingLoader(test_warc_dir)
cdx = CDXObject(cdx) cdx = CDXObject(cdx)
try: try:
(headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx) (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, revisit_func)
print headers print headers
sys.stdout.write(stream.readline()) sys.stdout.write(stream.readline())
sys.stdout.write(stream.readline()) sys.stdout.write(stream.readline())
except ArchiveLoadFailed as e: except ArchiveLoadFailed as e:
print 'Exception: ' + e.__class__.__name__ if reraise:
raise
else:
print 'Exception: ' + e.__class__.__name__
#============================================================================== #==============================================================================

View File

@ -1,38 +1,38 @@
com,example)/ 20140127171200 zipnum 0 276 com,example)/ 20140127171200 zipnum 0 276 1
org,iana)/ 20140127171238 zipnum 276 328 org,iana)/ 20140127171238 zipnum 276 328 2
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 5
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 6
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 7
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 8
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 9
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312 org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312 10
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234 org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234 11
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 12
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 13
org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 14
org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 15
org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 16
org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 17
org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 18
org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 19
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 20
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 21
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 22
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 23
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 24
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 25
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 26
org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 27
org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 28
org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 29
org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 30
org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 31
org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 32
org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 33
org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 34
org,iana)/dnssec 20140126201307 zipnum 8511 373 org,iana)/dnssec 20140126201307 zipnum 8511 373 35
org,iana)/domains/int 20140126201239 zipnum 8884 353 org,iana)/domains/int 20140126201239 zipnum 8884 353 36
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
org,iana)/time-zones 20140126200737 zipnum 9623 145 org,iana)/time-zones 20140126200737 zipnum 9623 145 38

View File

@ -1,8 +1,10 @@
from pytest import raises
import webtest import webtest
from pywb.core.pywb_init import create_wb_router from pywb.core.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
class TestWb: class TestWb:
TEST_CONFIG = 'tests/test_config.yaml' TEST_CONFIG = 'tests/test_config.yaml'
@ -38,6 +40,16 @@ class TestWb:
self._assert_basic_html(resp) self._assert_basic_html(resp)
assert 'Search' in resp.body assert 'Search' in resp.body
def test_pywb_root_head(self):
resp = self.testapp.head('/pywb/')
assert resp.content_type == 'text/html'
assert resp.status_int == 200
def test_pywb_invalid_path(self):
resp = self.testapp.head('/blah/', status=404)
assert resp.content_type == 'text/html'
assert resp.status_int == 404
def test_calendar_query(self): def test_calendar_query(self):
resp = self.testapp.get('/pywb/*/iana.org') resp = self.testapp.get('/pywb/*/iana.org')
self._assert_basic_html(resp) self._assert_basic_html(resp)
@ -246,3 +258,10 @@ class TestWb:
assert resp.status_int == 400 assert resp.status_int == 400
assert 'Invalid Url: http://?abc' in resp.body assert 'Invalid Url: http://?abc' in resp.body
def test_invalid_config(self):
with raises(IOError):
init_app(create_wb_router,
load_yaml=True,
config_file='x-invalid-x')