mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
remote cdx refactoring: refactor remote cdx source and server to support
fuzzy matching; test local cdx server, remote cdx source, and local and remote filtering with self-contained unit tests; map remote cdx HTTP errors to pywb exceptions
This commit is contained in:
parent
5847087aae
commit
4e53c2e9d8
@ -108,7 +108,7 @@ class FuzzyQuery:
|
||||
'filter': filter_,
|
||||
'output': output}
|
||||
|
||||
return CDXQuery(**params)
|
||||
return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -58,15 +58,27 @@ class BaseCDXServer(object):
|
||||
|
||||
fuzzy_query_params = self.fuzzy_query(query)
|
||||
if fuzzy_query_params:
|
||||
return self.load_cdx_query(fuzzy_query_params)
|
||||
return self.load_cdx(**fuzzy_query_params)
|
||||
|
||||
msg = 'No Captures found for: ' + query.url
|
||||
print self.fuzzy_query
|
||||
print query.params
|
||||
raise NotFoundException(msg)
|
||||
|
||||
def load_cdx(self, **params):
|
||||
return self.load_cdx_query(CDXQuery(**params))
|
||||
query = CDXQuery(**params)
|
||||
|
||||
def load_cdx_query(self, query):
|
||||
url = query.url
|
||||
key, end_key = calc_search_range(url=url,
|
||||
match_type=query.match_type,
|
||||
url_canon=self.url_canon)
|
||||
query.set_key(key, end_key)
|
||||
|
||||
cdx_iter = self._load_cdx_query(query)
|
||||
|
||||
return self._check_cdx_iter(cdx_iter, query)
|
||||
|
||||
def _load_cdx_query(self, query):
|
||||
raise NotImplementedError('Implement in subclass')
|
||||
|
||||
@staticmethod
|
||||
@ -93,7 +105,7 @@ class CDXServer(BaseCDXServer):
|
||||
# config argument.
|
||||
self._create_cdx_sources(paths, kwargs.get('config'))
|
||||
|
||||
def load_cdx_query(self, query):
|
||||
def _load_cdx_query(self, query):
|
||||
"""
|
||||
load CDX for query parameters ``params``.
|
||||
``key`` (or ``url``) parameter specifies URL to query,
|
||||
@ -107,17 +119,7 @@ class CDXServer(BaseCDXServer):
|
||||
:type query: :class:`~pywb.cdx.query.CDXQuery`
|
||||
:rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
|
||||
"""
|
||||
url = query.url
|
||||
key, end_key = calc_search_range(url=url,
|
||||
match_type=query.match_type,
|
||||
url_canon=self.url_canon)
|
||||
query.set_key(key, end_key)
|
||||
|
||||
cdx_iter = cdx_load(self.sources,
|
||||
query)
|
||||
#perms_checker=self.perms_checker)
|
||||
|
||||
return self._check_cdx_iter(cdx_iter, query)
|
||||
return cdx_load(self.sources, query)
|
||||
|
||||
def _create_cdx_sources(self, paths, config):
|
||||
"""
|
||||
@ -201,9 +203,8 @@ class RemoteCDXServer(BaseCDXServer):
|
||||
else:
|
||||
raise Exception('Invalid remote cdx source: ' + str(source))
|
||||
|
||||
def load_cdx_query(self, query):
|
||||
remote_iter = cdx_load([self.source], query, process=False)
|
||||
return self._check_cdx_iter(remote_iter, query)
|
||||
def _load_cdx_query(self, query):
|
||||
return cdx_load([self.source], query, process=False)
|
||||
|
||||
def __str__(self):
|
||||
return 'Remote CDX server serving from ' + str(self.sources[0])
|
||||
|
@ -2,6 +2,8 @@ from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.loaders import SeekableTextFileReader
|
||||
|
||||
from pywb.utils.wbexception import AccessException, NotFoundException
|
||||
from pywb.utils.wbexception import BadRequestException, WbException
|
||||
|
||||
from query import CDXQuery
|
||||
|
||||
import urllib
|
||||
@ -51,14 +53,14 @@ class RemoteCDXSource(CDXSource):
|
||||
if self.remote_processing:
|
||||
remote_query = query
|
||||
else:
|
||||
# Only send url and matchType params to remote
|
||||
# Only send url and matchType to remote
|
||||
remote_query = CDXQuery(url=query.url,
|
||||
match_type=query.match_type)
|
||||
|
||||
urlparams = remote_query.urlencode()
|
||||
|
||||
try:
|
||||
request = urllib2.Request(self.remote_url, urlparams)
|
||||
request = urllib2.Request(self.remote_url + '?' + urlparams)
|
||||
|
||||
if self.cookie:
|
||||
request.add_header('Cookie', self.cookie)
|
||||
@ -67,16 +69,15 @@ class RemoteCDXSource(CDXSource):
|
||||
|
||||
except urllib2.HTTPError as e:
|
||||
if e.code == 403:
|
||||
exc_msg = e.read()
|
||||
msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
|
||||
else 'Excluded')
|
||||
|
||||
raise AccessException(msg)
|
||||
raise AccessException('Access Denied')
|
||||
elif e.code == 404:
|
||||
msg = 'No Captures found for: ' + query.url
|
||||
raise NotFoundException(msg)
|
||||
# return empty list for consistency with other cdx sources
|
||||
# will be converted to 404 if no other retry
|
||||
return []
|
||||
elif e.code == 400:
|
||||
raise BadRequestException()
|
||||
else:
|
||||
raise
|
||||
raise WbException('Invalid response from remote cdx server')
|
||||
|
||||
return iter(response)
|
||||
|
||||
|
170
pywb/cdx/test/test_cdxops.py
Normal file
170
pywb/cdx/test/test_cdxops.py
Normal file
@ -0,0 +1,170 @@
|
||||
#=================================================================
|
||||
"""
|
||||
# Merge Sort Multiple CDX Sources
|
||||
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
|
||||
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
|
||||
|
||||
|
||||
# Limit CDX Stream
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
|
||||
|
||||
|
||||
# Reverse CDX Stream
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
|
||||
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
||||
|
||||
# No matching results
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
|
||||
# No matching -- limit=1
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
|
||||
# Filter cdx (default: regex)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
|
||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
|
||||
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
|
||||
# Filter exact
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter exact invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Filter contains
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter contains invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Collapse by timestamp
|
||||
# unresolved revisits, different statuscode results in an extra repeat
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
||||
|
||||
# resolved revisits
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
||||
|
||||
|
||||
# Sort by closest timestamp + field select output
|
||||
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
|
||||
20140126200826
|
||||
20140126200816
|
||||
20140126200805
|
||||
20140126200912
|
||||
20140126200738
|
||||
20140126200930
|
||||
20140126200718
|
||||
20140126200706
|
||||
20140126200654
|
||||
20140126200625
|
||||
|
||||
# In case of both reverse and closest, closest takes precedence
|
||||
# 'reverse closest' not supported at this time
|
||||
# if it is, this test will reflect the change
|
||||
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True)
|
||||
20140126200826
|
||||
20140126200816
|
||||
20140126200805
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
|
||||
# equal dist prefer earlier
|
||||
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200654
|
||||
20140126200706
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200706
|
||||
20140126200654
|
||||
|
||||
|
||||
# Resolve Revisits
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
|
||||
"""
|
||||
|
||||
#=================================================================
|
||||
from pywb.cdx.cdxserver import CDXServer
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||
|
||||
|
||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    """
    Doctest helper: query a CDXServer built from ``sources`` and print
    every result, one record per line.

    :param url: url to look up; stored in the query params as ``url``
    :param sources: cdx sources handed to the ``CDXServer`` constructor
    :param kwparams: any additional query params, passed on to
                     ``load_cdx``; ``output`` is always forced to
                     ``'cdxobject'``. If a non-empty ``fields`` value is
                     given, it is also split on ``,`` and used to select
                     the fields written for each result.
    """
    kwparams.update(url=url, output='cdxobject')

    # The original comma-separated string stays in kwparams for the
    # server; only the local copy is turned into a list for to_text().
    field_list = kwparams.get('fields')
    field_list = field_list.split(',') if field_list else field_list

    cdx_iter = CDXServer(sources).load_cdx(**kwparams)

    for cdx in cdx_iter:
        # tabs are flattened to spaces so the doctest expected output
        # can be written with plain single spaces
        sys.stdout.write(cdx.to_text(field_list).replace('\t', ' '))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executed directly as a script: run the doctests embedded in this
    # module's docstring.
    import doctest
    doctest.testmod()
|
@ -1,149 +1,23 @@
|
||||
#=================================================================
|
||||
"""
|
||||
# Merge Sort Multiple CDX Sources
|
||||
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
|
||||
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
|
||||
from pywb.apps.cdx_server import application
|
||||
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
|
||||
|
||||
from pywb.utils.wbexception import AccessException, NotFoundException
|
||||
from pywb.utils.wbexception import BadRequestException, WbException
|
||||
|
||||
# Limit CDX Stream
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
|
||||
from urllib2 import HTTPError
|
||||
|
||||
from mock import patch
|
||||
from pytest import raises
|
||||
import webtest
|
||||
|
||||
# Reverse CDX Stream
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
from pywb import get_test_dir
|
||||
|
||||
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
|
||||
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
||||
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
||||
|
||||
# No matching results
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
CDX_SERVER_URL = 'http://localhost/cdx'
|
||||
|
||||
# No matching -- limit=1
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
|
||||
# Filter cdx (default: regex)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
|
||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
|
||||
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
|
||||
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
|
||||
# Filter exact
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter exact invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Filter contains
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter contains invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Collapse by timestamp
|
||||
# unresolved revisits, different statuscode results in an extra repeat
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
||||
|
||||
# resolved revisits
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
||||
|
||||
|
||||
# Sort by closest timestamp + field select output
|
||||
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
|
||||
20140126200826
|
||||
20140126200816
|
||||
20140126200805
|
||||
20140126200912
|
||||
20140126200738
|
||||
20140126200930
|
||||
20140126200718
|
||||
20140126200706
|
||||
20140126200654
|
||||
20140126200625
|
||||
|
||||
# In case of both reverse and closest, closest takes precedence
|
||||
# 'reverse closest' not supported at this time
|
||||
# if it is, this test will reflect the change
|
||||
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True)
|
||||
20140126200826
|
||||
20140126200816
|
||||
20140126200805
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
|
||||
# equal dist prefer earlier
|
||||
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200654
|
||||
20140126200706
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200706
|
||||
20140126200654
|
||||
|
||||
|
||||
# Resolve Revisits
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
|
||||
|
||||
|
||||
# CDX Server init
|
||||
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw')
|
||||
>>> y = x.next(); pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
CDX_RESULT = [
|
||||
('urlkey', 'com,example)/'),
|
||||
('timestamp', '20140127171200'),
|
||||
('original', 'http://example.com'),
|
||||
('mimetype', 'text/html'),
|
||||
@ -153,63 +27,128 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('robotflags', '-'),
|
||||
('length', '1046'),
|
||||
('offset', '334'),
|
||||
('filename', 'dupes.warc.gz')]
|
||||
('filename', 'dupes.warc.gz')
|
||||
]
|
||||
|
||||
# NOTE: external dependency -- need self-contained test TODO
|
||||
testapp = None
|
||||
|
||||
# Load remote query but filter locally
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20020120142510'),
|
||||
('original', 'http://example.com:80/'),
|
||||
('mimetype', 'text/html'),
|
||||
('statuscode', '200'),
|
||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||
('length', '1792')]
|
||||
|
||||
# No local filtering/processing of cdx, simply return result from remote server
|
||||
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20020120142510'),
|
||||
('original', 'http://example.com:80/'),
|
||||
('mimetype', 'text/html'),
|
||||
('statuscode', '200'),
|
||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||
('length', '1792')]
|
||||
|
||||
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
||||
Traceback (most recent call last):
|
||||
AccessException: Blocked By Robots
|
||||
"""
|
||||
|
||||
#=================================================================
|
||||
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
|
||||
import os
|
||||
import sys
|
||||
import pprint
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||
def setup_module(self):
|
||||
global testapp
|
||||
testapp = webtest.TestApp(application)
|
||||
|
||||
|
||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||
kwparams['url'] = url
|
||||
kwparams['output'] = 'cdxobject'
|
||||
fields = kwparams.get('fields')
|
||||
if fields:
|
||||
fields = fields.split(',')
|
||||
def mock_urlopen(req):
|
||||
resp = testapp.get(req.get_full_url())
|
||||
return resp.body.split('\n')
|
||||
|
||||
server = CDXServer(sources)
|
||||
results = server.load_cdx(**kwparams)
|
||||
def mock_urlopen_err(err):
|
||||
def make_err(req):
|
||||
raise HTTPError(req.get_full_url(), err, None, None, None)
|
||||
return make_err
|
||||
|
||||
for x in results:
|
||||
l = x.to_text(fields).replace('\t', ' ')
|
||||
sys.stdout.write(l)
|
||||
# First time expect a 404 when called with 'exact',
|
||||
# Second time expect a 200 for fuzzy match
|
||||
def mock_urlopen_fuzzy(req):
|
||||
status = 200
|
||||
if 'exact' in req.get_full_url():
|
||||
status = 404
|
||||
|
||||
resp = testapp.get(req.get_full_url(), status=status)
|
||||
|
||||
if status == 200:
|
||||
return resp.body.split('\n')
|
||||
else:
|
||||
raise mock_urlopen_err(404)(req)
|
||||
|
||||
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen)
|
||||
def assert_cdx_match(server):
|
||||
x = server.load_cdx(url='example.com',
|
||||
limit=2,
|
||||
output='cdxobject')
|
||||
x.next()
|
||||
assert x.next().items() == CDX_RESULT
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
def assert_cdx_fuzzy_match(server, mock=mock_urlopen):
|
||||
with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock):
|
||||
x = server.load_cdx(url='http://example.com?_=123',
|
||||
limit=2,
|
||||
output='cdxobject',
|
||||
allowFuzzy=True)
|
||||
x.next()
|
||||
assert x.next().items() == CDX_RESULT
|
||||
|
||||
|
||||
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(404))
|
||||
def assert_404(server):
|
||||
server.load_cdx(url='http://notfound.example.com')
|
||||
|
||||
|
||||
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(403))
|
||||
def assert_403(server):
|
||||
server.load_cdx(url='http://notfound.example.com')
|
||||
|
||||
|
||||
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(400))
|
||||
def assert_400(server):
|
||||
server.load_cdx(url='http://notfound.example.com')
|
||||
|
||||
|
||||
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(502))
|
||||
def assert_502(server):
|
||||
server.load_cdx(url='http://notfound.example.com')
|
||||
|
||||
|
||||
def test_match():
|
||||
# Local CDX Server
|
||||
assert_cdx_match(CDXServer([TEST_CDX_DIR]))
|
||||
|
||||
# Remote CDX Source, Local Filtering
|
||||
assert_cdx_match(CDXServer(CDX_SERVER_URL))
|
||||
|
||||
# Remote CDX Query (Remote Filtering)
|
||||
assert_cdx_match(RemoteCDXServer(CDX_SERVER_URL))
|
||||
|
||||
|
||||
# TODO: make these automatic
|
||||
DEFAULT_RULES = 'pywb/rules.yaml'
|
||||
|
||||
def test_fuzzy_match():
|
||||
# Local CDX Server
|
||||
assert_cdx_fuzzy_match(CDXServer([TEST_CDX_DIR],
|
||||
ds_rules_file=DEFAULT_RULES))
|
||||
|
||||
# Remote CDX Source, Local Filtering
|
||||
# two calls to remote, first exact with 404,
|
||||
# then fuzzy with 200
|
||||
assert_cdx_fuzzy_match(CDXServer(CDX_SERVER_URL,
|
||||
ds_rules_file=DEFAULT_RULES),
|
||||
mock_urlopen_fuzzy)
|
||||
|
||||
# Remote CDX Query (Remote Filtering)
|
||||
# fuzzy match handled on remote, single response
|
||||
assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL,
|
||||
ds_rules_file=DEFAULT_RULES))
|
||||
|
||||
def assert_error(func, exception):
|
||||
with raises(exception):
|
||||
func(CDXServer(CDX_SERVER_URL))
|
||||
|
||||
with raises(exception):
|
||||
func(RemoteCDXServer(CDX_SERVER_URL))
|
||||
|
||||
def test_err_404():
|
||||
# Test local for consistency
|
||||
with raises(NotFoundException):
|
||||
assert_404(CDXServer([TEST_CDX_DIR]))
|
||||
|
||||
assert_error(assert_404, NotFoundException)
|
||||
|
||||
def test_err_403():
|
||||
assert_error(assert_403, AccessException)
|
||||
|
||||
def test_err_400():
|
||||
assert_error(assert_400, BadRequestException)
|
||||
|
||||
def test_err_502():
|
||||
assert_error(assert_502, WbException)
|
||||
|
@ -23,7 +23,7 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
|
||||
|
||||
"""
|
||||
|
||||
from test_cdxserver import cdx_ops_test
|
||||
from test_cdxops import cdx_ops_test
|
||||
from pywb import get_test_dir
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user