From 91184426b78ac11a10469aac815c288923bfeaf4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 2 Apr 2014 13:16:54 -0700 Subject: [PATCH] test coverage pass: refactor and cleanup to improve coverage for corner cases --- pywb/cdx/cdxobject.py | 4 +- pywb/cdx/test/test_zipnum.py | 6 +- pywb/core/replay_views.py | 22 +++--- pywb/framework/test/test_wbrequestresponse.py | 4 + pywb/framework/wbrequestresponse.py | 10 +-- pywb/framework/wsgi_wrappers.py | 24 +++--- pywb/rewrite/test/test_wburl.py | 4 + pywb/rewrite/wburl.py | 12 ++- pywb/warc/recordloader.py | 9 ++- pywb/warc/test/test_loading.py | 22 +++++- sample_archive/zipcdx/zipnum-sample.idx | 76 +++++++++---------- tests/test_integration.py | 19 +++++ 12 files changed, 126 insertions(+), 86 deletions(-) diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 02b57c0e..e90d6567 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -1,6 +1,6 @@ -try: +try: # pragma: no cover from collections import OrderedDict -except ImportError: +except ImportError: # pragma: no cover from ordereddict import OrderedDict import itertools diff --git a/pywb/cdx/test/test_zipnum.py b/pywb/cdx/test/test_zipnum.py index 6e303740..95079d52 100644 --- a/pywb/cdx/test/test_zipnum.py +++ b/pywb/cdx/test/test_zipnum.py @@ -6,9 +6,9 @@ org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3 # test idx index (tabs replacad with 4 spaces) >>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True) -org,iana)/dnssec 20140126201307 zipnum 8511 373 -org,iana)/domains/int 20140126201239 zipnum 8884 353 -org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 +org,iana)/dnssec 20140126201307 zipnum 8511 373 35 +org,iana)/domains/int 20140126201239 zipnum 8884 353 36 +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 >>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index 2c9d0278..cf49f4a4 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -109,7 +109,7 @@ class ReplayView: response = None - if self.content_rewriter and wbrequest.wb_url.mod != 'id_': + if self.content_rewriter and not wbrequest.is_identity: response = self.rewrite_content(wbrequest, cdx, @@ -182,7 +182,7 @@ class ReplayView: (status_headers, response_gen) = result if self.buffer_response: - if wbrequest.wb_url.mod == 'id_': + if wbrequest.is_identity: status_headers.remove_header('content-length') response_gen = self.buffered_response(status_headers, response_gen) @@ -244,7 +244,7 @@ class ReplayView: # skip all 304s if (status_headers.statusline.startswith('304') and - not wbrequest.wb_url.mod == 'id_'): + not wbrequest.is_identity): raise CaptureException('Skipping 304 Modified: ' + str(cdx)) @@ -298,16 +298,18 @@ class ReplayView: >>> ReplayView.strip_scheme('about://example.com') ==\ ReplayView.strip_scheme('example.com') True + + >>> ReplayView.strip_scheme('http://') ==\ + ReplayView.strip_scheme('') + True + + >>> ReplayView.strip_scheme('#!@?') ==\ + ReplayView.strip_scheme('#!@?') + True """ m = ReplayView.STRIP_SCHEME.match(url) - if not m: - return url - match = m.group(2) - if match: - return match - else: - return url + return match if __name__ == "__main__": diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index e9a4ca9e..f090a6ae 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -10,6 +10,10 @@ >>> print_req_from_uri('/2010/example.com') {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} +# ajax +>>> print_req_from_uri('', {'REL_REQUEST_URI': '/2010/example.com', 'HTTP_HOST': 'localhost:8080', 'HTTP_X_REQUESTED_WITH': 'XMLHttpRequest'}) +{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} + >>> print_req_from_uri('../example.com') {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 991f42a2..ba1f6a02 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -100,14 +100,12 @@ class WbRequest(object): def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') - if not value: - return False - - if value.lower() == 'xmlhttprequest': + if value and value.lower() == 'xmlhttprequest': return True - if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')): - return True + #if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')): + # return True + return False def __repr__(self): diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 04931958..8a42e101 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -65,9 +65,6 @@ class WSGIApp(object): msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI']) raise NotFoundException(msg) -# except InternalRedirect as ir: -# return ir.response - except WbException as e: response = handle_exception(env, wb_router, e, False) @@ -115,19 +112,18 @@ def init_app(init_func, load_yaml=True, config_file=None): level=logging.DEBUG) logging.debug('') - if load_yaml: - # env setting overrides all others - env_config = os.environ.get('PYWB_CONFIG_FILE') - if env_config: - config_file = env_config - - if not config_file: - config_file = DEFAULT_CONFIG_FILE - - config = load_yaml_config(config_file) - try: if load_yaml: + # env setting overrides all others + env_config = os.environ.get('PYWB_CONFIG_FILE') + if env_config: + config_file = env_config + + if not config_file: + config_file = DEFAULT_CONFIG_FILE + + config = load_yaml_config(config_file) + wb_router = init_func(config) else: wb_router = init_func() diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index f4183bac..955e24df 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -68,6 +68,10 @@ Exception: Bad Request Url: http://#$%#/ Traceback (most recent call last): Exception: Bad Request Url: http://example.com:abc/ +>>> x = WbUrl('') +Traceback (most recent call last): +Exception: ('Invalid WbUrl: ', '') + # considered blank >>> x = WbUrl('https:/') >>> x = WbUrl('https:///') diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 1a91393a..67bab4fb 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -79,8 +79,8 @@ class BaseWbUrl(object): class WbUrl(BaseWbUrl): # Regexs # ====================== - QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.*)$') - REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') + QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)(?:-(\d+))?\*/?(.+)$') + REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$') DEFAULT_SCHEME = 'http://' # ====================== @@ -90,11 +90,9 @@ class WbUrl(BaseWbUrl): self.original_url = url - if not any(f(url) for f in [self._init_query, self._init_replay]): - raise Exception('Invalid WbUrl: ', url) - - if len(self.url) == 0: - raise Exception('Invalid WbUrl: ', url) + if not self._init_query(url): + if not self._init_replay(url): + raise Exception('Invalid WbUrl: ', url) # protocol agnostic url -> http:// # no protocol -> http:// diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 11524fed..96e149e3 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -21,9 +21,12 @@ ArcWarcRecord = collections.namedtuple('ArcWarcRecord', #================================================================= class ArchiveLoadFailed(WbException): def __init__(self, reason, filename=''): - super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) - #self.filename = filename - #self.reason = reason + if filename: + msg = filename + ':' + str(reason) + else: + msg = str(reason) + + super(ArchiveLoadFailed, self).__init__(msg) def status(self): return '503 Service Unavailable' diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 146dfb01..d896f120 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -233,6 +233,19 @@ Exception: ArchiveLoadFailed # Invalid WARC >>> parse_stream_error(stream=None, statusline='ABC', known_format='warc') Exception: ArchiveLoadFailed + +# Revisit Errors +# original not found +>>> load_from_cdx_test('com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - - 1864 example.warc.gz - - -', reraise=True) +Traceback (most recent call last): +ArchiveLoadFailed: Missing Revisit Original + +# no revisit func available +>>> load_from_cdx_test(URL_AGNOSTIC_REVISIT_CDX, revisit_func=None, reraise=True) +Traceback (most recent call last): +ArchiveLoadFailed: Original for revisit could not be loaded + + """ import os @@ -281,16 +294,19 @@ def load_orig_cdx(self): #============================================================================== -def load_from_cdx_test(cdx): +def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False): resolve_loader = ResolvingLoader(test_warc_dir) cdx = CDXObject(cdx) try: - (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, load_orig_cdx) + (headers, stream) = resolve_loader.resolve_headers_and_payload(cdx, None, revisit_func) print headers sys.stdout.write(stream.readline()) sys.stdout.write(stream.readline()) except ArchiveLoadFailed as e: - print 'Exception: ' + e.__class__.__name__ + if reraise: + raise + else: + print 'Exception: ' + e.__class__.__name__ #============================================================================== diff --git a/sample_archive/zipcdx/zipnum-sample.idx b/sample_archive/zipcdx/zipnum-sample.idx index a70d8e87..6697c131 100644 --- a/sample_archive/zipcdx/zipnum-sample.idx +++ b/sample_archive/zipcdx/zipnum-sample.idx @@ -1,38 +1,38 @@ -com,example)/ 20140127171200 zipnum 0 276 -org,iana)/ 20140127171238 zipnum 276 328 -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 -org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 -org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 -org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 -org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312 -org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234 -org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 -org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 -org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 -org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 -org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 -org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 -org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 -org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 -org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 -org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 -org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 -org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 -org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 -org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 -org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 -org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 -org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 -org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 -org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 -org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 -org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 -org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 -org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 -org,iana)/dnssec 20140126201307 zipnum 8511 373 -org,iana)/domains/int 20140126201239 zipnum 8884 353 -org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 -org,iana)/time-zones 20140126200737 zipnum 9623 145 +com,example)/ 20140127171200 zipnum 0 276 1 +org,iana)/ 20140127171238 zipnum 276 328 2 +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 5 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 6 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 7 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 8 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 9 +org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140127171240 zipnum 2394 312 10 +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234 11 +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 12 +org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 13 +org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 14 +org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 15 +org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 16 +org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 17 +org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 18 +org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 19 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 20 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 21 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 22 +org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 23 +org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 24 +org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 25 +org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 26 +org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 27 +org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 28 +org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 29 +org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 30 +org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 31 +org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 32 +org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 33 +org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 34 +org,iana)/dnssec 20140126201307 zipnum 8511 373 35 +org,iana)/domains/int 20140126201239 zipnum 8884 353 36 +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 +org,iana)/time-zones 20140126200737 zipnum 9623 145 38 diff --git a/tests/test_integration.py b/tests/test_integration.py index 017a0ca8..8adf994e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,8 +1,10 @@ +from pytest import raises import webtest from pywb.core.pywb_init import create_wb_router from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject + class TestWb: TEST_CONFIG = 'tests/test_config.yaml' @@ -38,6 +40,16 @@ class TestWb: self._assert_basic_html(resp) assert 'Search' in resp.body + def test_pywb_root_head(self): + resp = self.testapp.head('/pywb/') + assert resp.content_type == 'text/html' + assert resp.status_int == 200 + + def test_pywb_invalid_path(self): + resp = self.testapp.head('/blah/', status=404) + assert resp.content_type == 'text/html' + assert resp.status_int == 404 + def test_calendar_query(self): resp = self.testapp.get('/pywb/*/iana.org') self._assert_basic_html(resp) @@ -246,3 +258,10 @@ class TestWb: assert resp.status_int == 400 assert 'Invalid Url: http://?abc' in resp.body + def test_invalid_config(self): + with raises(IOError): + init_app(create_wb_router, + load_yaml=True, + config_file='x-invalid-x') + +