diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index ad2a4ac0..fb15c6c4 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -108,7 +108,7 @@ class FuzzyQuery: 'filter': filter_, 'output': output} - return CDXQuery(**params) + return params #================================================================= diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 24a34557..90443c85 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -58,15 +58,27 @@ class BaseCDXServer(object): fuzzy_query_params = self.fuzzy_query(query) if fuzzy_query_params: - return self.load_cdx_query(fuzzy_query_params) + return self.load_cdx(**fuzzy_query_params) msg = 'No Captures found for: ' + query.url + print self.fuzzy_query + print query.params raise NotFoundException(msg) def load_cdx(self, **params): - return self.load_cdx_query(CDXQuery(**params)) + query = CDXQuery(**params) - def load_cdx_query(self, query): + url = query.url + key, end_key = calc_search_range(url=url, + match_type=query.match_type, + url_canon=self.url_canon) + query.set_key(key, end_key) + + cdx_iter = self._load_cdx_query(query) + + return self._check_cdx_iter(cdx_iter, query) + + def _load_cdx_query(self, query): raise NotImplementedError('Implement in subclass') @staticmethod @@ -93,7 +105,7 @@ class CDXServer(BaseCDXServer): # config argument. self._create_cdx_sources(paths, kwargs.get('config')) - def load_cdx_query(self, query): + def _load_cdx_query(self, query): """ load CDX for query parameters ``params``. ``key`` (or ``url``) parameter specifies URL to query, @@ -107,17 +119,7 @@ class CDXServer(BaseCDXServer): :type query: :class:`~pywb.cdx.query.CDXQuery` :rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject` """ - url = query.url - key, end_key = calc_search_range(url=url, - match_type=query.match_type, - url_canon=self.url_canon) - query.set_key(key, end_key) - - cdx_iter = cdx_load(self.sources, - query) - #perms_checker=self.perms_checker) - - return self._check_cdx_iter(cdx_iter, query) + return cdx_load(self.sources, query) def _create_cdx_sources(self, paths, config): """ @@ -201,9 +203,8 @@ class RemoteCDXServer(BaseCDXServer): else: raise Exception('Invalid remote cdx source: ' + str(source)) - def load_cdx_query(self, query): - remote_iter = cdx_load([self.source], query, process=False) - return self._check_cdx_iter(remote_iter, query) + def _load_cdx_query(self, query): + return cdx_load([self.source], query, process=False) def __str__(self): return 'Remote CDX server serving from ' + str(self.sources[0]) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 04592c3e..97670365 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -2,6 +2,8 @@ from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.wbexception import AccessException, NotFoundException +from pywb.utils.wbexception import BadRequestException, WbException + from query import CDXQuery import urllib @@ -51,14 +53,14 @@ class RemoteCDXSource(CDXSource): if self.remote_processing: remote_query = query else: - # Only send url and matchType params to remote + # Only send url and matchType to remote remote_query = CDXQuery(url=query.url, match_type=query.match_type) urlparams = remote_query.urlencode() try: - request = urllib2.Request(self.remote_url, urlparams) + request = urllib2.Request(self.remote_url + '?' + urlparams) if self.cookie: request.add_header('Cookie', self.cookie) @@ -67,16 +69,15 @@ class RemoteCDXSource(CDXSource): except urllib2.HTTPError as e: if e.code == 403: - exc_msg = e.read() - msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg - else 'Excluded') - - raise AccessException(msg) + raise AccessException('Access Denied') elif e.code == 404: - msg = 'No Captures found for: ' + query.url - raise NotFoundException(msg) + # return empty list for consistency with other cdx sources + # will be converted to 404 if no other retry + return [] + elif e.code == 400: + raise BadRequestException() else: - raise + raise WbException('Invalid response from remote cdx server') return iter(response) diff --git a/pywb/cdx/test/test_cdxops.py b/pywb/cdx/test/test_cdxops.py new file mode 100644 index 00000000..edfcd749 --- /dev/null +++ b/pywb/cdx/test/test_cdxops.py @@ -0,0 +1,170 @@ +#================================================================= +""" +# Merge Sort Multipe CDX Sources +>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) +org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz +org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz +org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz + + +# Limit CDX Stream +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3) +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz + + +# Reverse CDX Stream +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3) +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz + +>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1) +org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz + +# No matching results +>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) +Traceback (most recent call last): +NotFoundException: No Captures found for: http://iana.org/dont_have_this + +# No matching -- limit=1 +>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) +Traceback (most recent call last): +NotFoundException: No Captures found for: http://iana.org/dont_have_this + +# Filter cdx (default: regex) +>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html']) +org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz +org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz +org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz +org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz +org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz +org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200') +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz + +# Filter exact +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1') +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz + +# Filter exact invert +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1') +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz +com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz +com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + +# Filter contains +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz + +# Filter contains invert +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz +com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz +com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + +# Collapse by timestamp +# unresolved revisits, different statuscode results in an extra repeat +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz +org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz +org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz + +# resolved revisits +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True) +org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - +org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz + + +# Sort by closest timestamp + field select output +>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) +20140126200826 +20140126200816 +20140126200805 +20140126200912 +20140126200738 +20140126200930 +20140126200718 +20140126200706 +20140126200654 +20140126200625 + +# In case of both reverse and closest, closest takes precedence +# 'reverse closest' not supported at this time +# if it is, this test will reflect the change +>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True) +20140126200826 +20140126200816 +20140126200805 + +>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) +org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - +org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - + + +>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True) +org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - +org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - + +# equal dist prefer earlier +>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2) +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz + +>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') +20140126200654 +20140126200706 + +>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') +20140126200706 +20140126200654 + + +# Resolve Revisits +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True) +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - - +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz + +>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True) +org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - +org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - +""" + +#================================================================= +from pywb.cdx.cdxserver import CDXServer +import os +import sys + +from pywb import get_test_dir + +test_cdx_dir = get_test_dir() + 'cdx/' + + +def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): + kwparams['url'] = url + kwparams['output'] = 'cdxobject' + fields = kwparams.get('fields') + if fields: + fields = fields.split(',') + + server = CDXServer(sources) + results = server.load_cdx(**kwparams) + + for x in results: + l = x.to_text(fields).replace('\t', ' ') + sys.stdout.write(l) + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/cdx/test/test_cdxserver.py b/pywb/cdx/test/test_cdxserver.py index ebfffb91..61a4cce8 100644 --- a/pywb/cdx/test/test_cdxserver.py +++ b/pywb/cdx/test/test_cdxserver.py @@ -1,149 +1,23 @@ -#================================================================= -""" -# Merge Sort Multipe CDX Sources ->>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) -org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz -org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz -org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz +from pywb.apps.cdx_server import application +from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer +from pywb.utils.wbexception import AccessException, NotFoundException +from pywb.utils.wbexception import BadRequestException, WbException -# Limit CDX Stream ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3) -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz +from urllib2 import HTTPError +from mock import patch +from pytest import raises +import webtest -# Reverse CDX Stream ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3) -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz +from pywb import get_test_dir ->>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1) -org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz +TEST_CDX_DIR = get_test_dir() + 'cdx/' -# No matching results ->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) -Traceback (most recent call last): -NotFoundException: No Captures found for: http://iana.org/dont_have_this +CDX_SERVER_URL = 'http://localhost/cdx' -# No matching -- limit=1 ->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) -Traceback (most recent call last): -NotFoundException: No Captures found for: http://iana.org/dont_have_this - -# Filter cdx (default: regex) ->>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html']) -org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz -org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz -org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz -org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz -org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz -org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz -org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz -org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz -org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz - ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200') -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - -# Filter exact ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1') -com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz -com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz - -# Filter exact invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1') -com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz -com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz - -# Filter contains ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') -com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz -com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz - -# Filter contains invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') -com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz -com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz -com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz - -# Collapse by timestamp -# unresolved revisits, different statuscode results in an extra repeat ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz -org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz -org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz - -# resolved revisits ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True) -org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - -org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz - - -# Sort by closest timestamp + field select output ->>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10) -20140126200826 -20140126200816 -20140126200805 -20140126200912 -20140126200738 -20140126200930 -20140126200718 -20140126200706 -20140126200654 -20140126200625 - -# In case of both reverse and closest, closest takes precedence -# 'reverse closest' not supported at this time -# if it is, this test will reflect the change ->>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True) -20140126200826 -20140126200816 -20140126200805 - ->>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) -org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - -org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - - - ->>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True) -org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - -org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - - -# equal dist prefer earlier ->>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2) -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz - ->>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') -20140126200654 -20140126200706 - ->>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') -20140126200706 -20140126200654 - - -# Resolve Revisits ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True) -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - - -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz - ->>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True) -org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - -org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - - - -# CDX Server init ->>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw') ->>> y = x.next(); pprint.pprint(x.next().items()) -[('urlkey', 'com,example)/'), +CDX_RESULT = [ + ('urlkey', 'com,example)/'), ('timestamp', '20140127171200'), ('original', 'http://example.com'), ('mimetype', 'text/html'), @@ -153,63 +27,128 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('robotflags', '-'), ('length', '1046'), ('offset', '334'), - ('filename', 'dupes.warc.gz')] + ('filename', 'dupes.warc.gz') +] -# NOTE: external dependency -- need self-contained test TODO +testapp = None -# Load remote query but filter locally ->>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') ->>> pprint.pprint(x.next().items()) -[('urlkey', 'com,example)/'), - ('timestamp', '20020120142510'), - ('original', 'http://example.com:80/'), - ('mimetype', 'text/html'), - ('statuscode', '200'), - ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), - ('length', '1792')] - -# No local filtering/processing of cdx, simply return result from remote server ->>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') ->>> pprint.pprint(x.next().items()) -[('urlkey', 'com,example)/'), - ('timestamp', '20020120142510'), - ('original', 'http://example.com:80/'), - ('mimetype', 'text/html'), - ('statuscode', '200'), - ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), - ('length', '1792')] - ->>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') -Traceback (most recent call last): -AccessException: Blocked By Robots -""" - -#================================================================= -from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer -import os -import sys -import pprint - -from pywb import get_test_dir - -test_cdx_dir = get_test_dir() + 'cdx/' +def setup_module(self): + global testapp + testapp = webtest.TestApp(application) -def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): - kwparams['url'] = url - kwparams['output'] = 'cdxobject' - fields = kwparams.get('fields') - if fields: - fields = fields.split(',') +def mock_urlopen(req): + resp = testapp.get(req.get_full_url()) + return resp.body.split('\n') - server = CDXServer(sources) - results = server.load_cdx(**kwparams) +def mock_urlopen_err(err): + def make_err(req): + raise HTTPError(req.get_full_url(), err, None, None, None) + return make_err - for x in results: - l = x.to_text(fields).replace('\t', ' ') - sys.stdout.write(l) +# First time expect a 404 when called with 'exact', +# Second time expect a 200 for fuzzy match +def mock_urlopen_fuzzy(req): + status = 200 + if 'exact' in req.get_full_url(): + status = 404 + + resp = testapp.get(req.get_full_url(), status=status) + + if status == 200: + return resp.body.split('\n') + else: + raise mock_urlopen_err(404)(req) + +@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen) +def assert_cdx_match(server): + x = server.load_cdx(url='example.com', + limit=2, + output='cdxobject') + x.next() + assert x.next().items() == CDX_RESULT -if __name__ == "__main__": - import doctest - doctest.testmod() +def assert_cdx_fuzzy_match(server, mock=mock_urlopen): + with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock): + x = server.load_cdx(url='http://example.com?_=123', + limit=2, + output='cdxobject', + allowFuzzy=True) + x.next() + assert x.next().items() == CDX_RESULT + + +@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(404)) +def assert_404(server): + server.load_cdx(url='http://notfound.example.com') + + +@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(403)) +def assert_403(server): + server.load_cdx(url='http://notfound.example.com') + + +@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(400)) +def assert_400(server): + server.load_cdx(url='http://notfound.example.com') + + +@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(502)) +def assert_502(server): + server.load_cdx(url='http://notfound.example.com') + + +def test_match(): + # Local CDX Server + assert_cdx_match(CDXServer([TEST_CDX_DIR])) + + # Remote CDX Source, Local Filtering + assert_cdx_match(CDXServer(CDX_SERVER_URL)) + + # Remote CDX Query (Remote Filtering) + assert_cdx_match(RemoteCDXServer(CDX_SERVER_URL)) + + +# TODO: make these automatic +DEFAULT_RULES = 'pywb/rules.yaml' + +def test_fuzzy_match(): + # Local CDX Server + assert_cdx_fuzzy_match(CDXServer([TEST_CDX_DIR], + ds_rules_file=DEFAULT_RULES)) + + # Remote CDX Source, Local Filtering + # two calls to remote, first exact with 404, + # then fuzzy with 200 + assert_cdx_fuzzy_match(CDXServer(CDX_SERVER_URL, + ds_rules_file=DEFAULT_RULES), + mock_urlopen_fuzzy) + + # Remote CDX Query (Remote Filtering) + # fuzzy match handled on remote, single response + assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL, + ds_rules_file=DEFAULT_RULES)) + +def assert_error(func, exception): + with raises(exception): + func(CDXServer(CDX_SERVER_URL)) + + with raises(exception): + func(RemoteCDXServer(CDX_SERVER_URL)) + +def test_err_404(): + # Test local for consistency + with raises(NotFoundException): + assert_404(CDXServer([TEST_CDX_DIR])) + + assert_error(assert_404, NotFoundException) + +def test_err_403(): + assert_error(assert_403, AccessException) + +def test_err_400(): + assert_error(assert_400, BadRequestException) + +def test_err_502(): + assert_error(assert_502, WbException) diff --git a/pywb/cdx/test/test_zipnum.py b/pywb/cdx/test/test_zipnum.py index 3b1de326..6e303740 100644 --- a/pywb/cdx/test/test_zipnum.py +++ b/pywb/cdx/test/test_zipnum.py @@ -23,7 +23,7 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s """ -from test_cdxserver import cdx_ops_test +from test_cdxops import cdx_ops_test from pywb import get_test_dir