1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

remote cdx refactoring: refactor remote cdx source and server to support

fuzzy matching
test local cdx server, remote cdx source, local and remote filtering
with self-contained unit tests
map remote cdx httperrors to pywb exceptions
This commit is contained in:
Ilya Kreymer 2014-03-26 11:33:46 -07:00
parent 5847087aae
commit 4e53c2e9d8
6 changed files with 332 additions and 221 deletions

View File

@ -108,7 +108,7 @@ class FuzzyQuery:
'filter': filter_, 'filter': filter_,
'output': output} 'output': output}
return CDXQuery(**params) return params
#================================================================= #=================================================================

View File

@ -58,15 +58,27 @@ class BaseCDXServer(object):
fuzzy_query_params = self.fuzzy_query(query) fuzzy_query_params = self.fuzzy_query(query)
if fuzzy_query_params: if fuzzy_query_params:
return self.load_cdx_query(fuzzy_query_params) return self.load_cdx(**fuzzy_query_params)
msg = 'No Captures found for: ' + query.url msg = 'No Captures found for: ' + query.url
print self.fuzzy_query
print query.params
raise NotFoundException(msg) raise NotFoundException(msg)
def load_cdx(self, **params): def load_cdx(self, **params):
return self.load_cdx_query(CDXQuery(**params)) query = CDXQuery(**params)
def load_cdx_query(self, query): url = query.url
key, end_key = calc_search_range(url=url,
match_type=query.match_type,
url_canon=self.url_canon)
query.set_key(key, end_key)
cdx_iter = self._load_cdx_query(query)
return self._check_cdx_iter(cdx_iter, query)
def _load_cdx_query(self, query):
raise NotImplementedError('Implement in subclass') raise NotImplementedError('Implement in subclass')
@staticmethod @staticmethod
@ -93,7 +105,7 @@ class CDXServer(BaseCDXServer):
# config argument. # config argument.
self._create_cdx_sources(paths, kwargs.get('config')) self._create_cdx_sources(paths, kwargs.get('config'))
def load_cdx_query(self, query): def _load_cdx_query(self, query):
""" """
load CDX for query parameters ``params``. load CDX for query parameters ``params``.
``key`` (or ``url``) parameter specifies URL to query, ``key`` (or ``url``) parameter specifies URL to query,
@ -107,17 +119,7 @@ class CDXServer(BaseCDXServer):
:type query: :class:`~pywb.cdx.query.CDXQuery` :type query: :class:`~pywb.cdx.query.CDXQuery`
:rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject` :rtype: iterator on :class:`~pywb.cdx.cdxobject.CDXObject`
""" """
url = query.url return cdx_load(self.sources, query)
key, end_key = calc_search_range(url=url,
match_type=query.match_type,
url_canon=self.url_canon)
query.set_key(key, end_key)
cdx_iter = cdx_load(self.sources,
query)
#perms_checker=self.perms_checker)
return self._check_cdx_iter(cdx_iter, query)
def _create_cdx_sources(self, paths, config): def _create_cdx_sources(self, paths, config):
""" """
@ -201,9 +203,8 @@ class RemoteCDXServer(BaseCDXServer):
else: else:
raise Exception('Invalid remote cdx source: ' + str(source)) raise Exception('Invalid remote cdx source: ' + str(source))
def load_cdx_query(self, query): def _load_cdx_query(self, query):
remote_iter = cdx_load([self.source], query, process=False) return cdx_load([self.source], query, process=False)
return self._check_cdx_iter(remote_iter, query)
def __str__(self): def __str__(self):
return 'Remote CDX server serving from ' + str(self.sources[0]) return 'Remote CDX server serving from ' + str(self.sources[0])

View File

@ -2,6 +2,8 @@ from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.wbexception import AccessException, NotFoundException from pywb.utils.wbexception import AccessException, NotFoundException
from pywb.utils.wbexception import BadRequestException, WbException
from query import CDXQuery from query import CDXQuery
import urllib import urllib
@ -51,14 +53,14 @@ class RemoteCDXSource(CDXSource):
if self.remote_processing: if self.remote_processing:
remote_query = query remote_query = query
else: else:
# Only send url and matchType params to remote # Only send url and matchType to remote
remote_query = CDXQuery(url=query.url, remote_query = CDXQuery(url=query.url,
match_type=query.match_type) match_type=query.match_type)
urlparams = remote_query.urlencode() urlparams = remote_query.urlencode()
try: try:
request = urllib2.Request(self.remote_url, urlparams) request = urllib2.Request(self.remote_url + '?' + urlparams)
if self.cookie: if self.cookie:
request.add_header('Cookie', self.cookie) request.add_header('Cookie', self.cookie)
@ -67,16 +69,15 @@ class RemoteCDXSource(CDXSource):
except urllib2.HTTPError as e: except urllib2.HTTPError as e:
if e.code == 403: if e.code == 403:
exc_msg = e.read() raise AccessException('Access Denied')
msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
else 'Excluded')
raise AccessException(msg)
elif e.code == 404: elif e.code == 404:
msg = 'No Captures found for: ' + query.url # return empty list for consistency with other cdx sources
raise NotFoundException(msg) # will be converted to 404 if no other retry
return []
elif e.code == 400:
raise BadRequestException()
else: else:
raise raise WbException('Invalid response from remote cdx server')
return iter(response) return iter(response)

View File

@ -0,0 +1,170 @@
#=================================================================
"""
# Merge Sort Multipe CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# Limit CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# No matching -- limit=1
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1)
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# Filter cdx (default: regex)
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter exact
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter exact invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Filter contains
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625
# In case of both reverse and closest, closest takes precedence
# 'reverse closest' not supported at this time
# if it is, this test will reflect the change
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True)
20140126200826
20140126200816
20140126200805
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
"""
#=================================================================
from pywb.cdx.cdxserver import CDXServer
import os
import sys
from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
kwparams['url'] = url
kwparams['output'] = 'cdxobject'
fields = kwparams.get('fields')
if fields:
fields = fields.split(',')
server = CDXServer(sources)
results = server.load_cdx(**kwparams)
for x in results:
l = x.to_text(fields).replace('\t', ' ')
sys.stdout.write(l)
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,149 +1,23 @@
#================================================================= from pywb.apps.cdx_server import application
""" from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
# Merge Sort Multipe CDX Sources
>>> cdx_ops_test(url = 'http://iana.org/', sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
from pywb.utils.wbexception import AccessException, NotFoundException
from pywb.utils.wbexception import BadRequestException, WbException
# Limit CDX Stream from urllib2 import HTTPError
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200625 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf warc/revisit - YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz
from mock import patch
from pytest import raises
import webtest
# Reverse CDX Stream from pywb import get_test_dir
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1) TEST_CDX_DIR = get_test_dir() + 'cdx/'
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results CDX_SERVER_URL = 'http://localhost/cdx'
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# No matching -- limit=1 CDX_RESULT = [
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 1) ('urlkey', 'com,example)/'),
Traceback (most recent call last):
NotFoundException: No Captures found for: http://iana.org/dont_have_this
# Filter cdx (default: regex)
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
org,iana)/domains/int 20140126201239 http://www.iana.org/domains/int text/html 200 X32BBNNORV4SPEHTQF5KI5NFHSKTZK6Q - - 2482 746788 iana.warc.gz
org,iana)/domains/reserved 20140126201054 http://www.iana.org/domains/reserved text/html 200 R5AAEQX5XY5X5DG66B23ODN5DUBWRA27 - - 3573 701457 iana.warc.gz
org,iana)/domains/root 20140126200912 http://www.iana.org/domains/root text/html 200 YWA2R6UVWCYNHBZJKBTPYPZ5CJWKGGUX - - 2691 657746 iana.warc.gz
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter exact
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter exact invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Filter contains
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
# Sort by closest timestamp + field select output
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 10)
20140126200826
20140126200816
20140126200805
20140126200912
20140126200738
20140126200930
20140126200718
20140126200706
20140126200654
20140126200625
# In case of both reverse and closest, closest takes precedence
# 'reverse closest' not supported at this time
# if it is, this test will reflect the change
>>> cdx_ops_test(closest = '20140126200826', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', fields = 'timestamp', limit = 3, reverse = True)
20140126200826
20140126200816
20140126200805
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
# CDX Server init
>>> x = CDXServer([test_cdx_dir]).load_cdx(url = 'example.com', limit = 2, output = 'raw')
>>> y = x.next(); pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20140127171200'), ('timestamp', '20140127171200'),
('original', 'http://example.com'), ('original', 'http://example.com'),
('mimetype', 'text/html'), ('mimetype', 'text/html'),
@ -153,63 +27,128 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('robotflags', '-'), ('robotflags', '-'),
('length', '1046'), ('length', '1046'),
('offset', '334'), ('offset', '334'),
('filename', 'dupes.warc.gz')] ('filename', 'dupes.warc.gz')
]
# NOTE: external dependency -- need self-contained test TODO testapp = None
# Load remote query but filter locally def setup_module(self):
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') global testapp
>>> pprint.pprint(x.next().items()) testapp = webtest.TestApp(application)
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
# No local filtering/processing of cdx, simply return result from remote server
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
Traceback (most recent call last):
AccessException: Blocked By Robots
"""
#=================================================================
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
import os
import sys
import pprint
from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): def mock_urlopen(req):
kwparams['url'] = url resp = testapp.get(req.get_full_url())
kwparams['output'] = 'cdxobject' return resp.body.split('\n')
fields = kwparams.get('fields')
if fields:
fields = fields.split(',')
server = CDXServer(sources) def mock_urlopen_err(err):
results = server.load_cdx(**kwparams) def make_err(req):
raise HTTPError(req.get_full_url(), err, None, None, None)
return make_err
for x in results: # First time expect a 404 when called with 'exact',
l = x.to_text(fields).replace('\t', ' ') # Second time expect a 200 for fuzzy match
sys.stdout.write(l) def mock_urlopen_fuzzy(req):
status = 200
if 'exact' in req.get_full_url():
status = 404
resp = testapp.get(req.get_full_url(), status=status)
if status == 200:
return resp.body.split('\n')
else:
raise mock_urlopen_err(404)(req)
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen)
def assert_cdx_match(server):
x = server.load_cdx(url='example.com',
limit=2,
output='cdxobject')
x.next()
assert x.next().items() == CDX_RESULT
if __name__ == "__main__": def assert_cdx_fuzzy_match(server, mock=mock_urlopen):
import doctest with patch('pywb.cdx.cdxsource.urllib2.urlopen', mock):
doctest.testmod() x = server.load_cdx(url='http://example.com?_=123',
limit=2,
output='cdxobject',
allowFuzzy=True)
x.next()
assert x.next().items() == CDX_RESULT
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(404))
def assert_404(server):
server.load_cdx(url='http://notfound.example.com')
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(403))
def assert_403(server):
server.load_cdx(url='http://notfound.example.com')
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(400))
def assert_400(server):
server.load_cdx(url='http://notfound.example.com')
@patch('pywb.cdx.cdxsource.urllib2.urlopen', mock_urlopen_err(502))
def assert_502(server):
server.load_cdx(url='http://notfound.example.com')
def test_match():
# Local CDX Server
assert_cdx_match(CDXServer([TEST_CDX_DIR]))
# Remote CDX Source, Local Filtering
assert_cdx_match(CDXServer(CDX_SERVER_URL))
# Remote CDX Query (Remote Filtering)
assert_cdx_match(RemoteCDXServer(CDX_SERVER_URL))
# TODO: make these automatic
DEFAULT_RULES = 'pywb/rules.yaml'
def test_fuzzy_match():
# Local CDX Server
assert_cdx_fuzzy_match(CDXServer([TEST_CDX_DIR],
ds_rules_file=DEFAULT_RULES))
# Remote CDX Source, Local Filtering
# two calls to remote, first exact with 404,
# then fuzzy with 200
assert_cdx_fuzzy_match(CDXServer(CDX_SERVER_URL,
ds_rules_file=DEFAULT_RULES),
mock_urlopen_fuzzy)
# Remote CDX Query (Remote Filtering)
# fuzzy match handled on remote, single response
assert_cdx_fuzzy_match(RemoteCDXServer(CDX_SERVER_URL,
ds_rules_file=DEFAULT_RULES))
def assert_error(func, exception):
with raises(exception):
func(CDXServer(CDX_SERVER_URL))
with raises(exception):
func(RemoteCDXServer(CDX_SERVER_URL))
def test_err_404():
# Test local for consistency
with raises(NotFoundException):
assert_404(CDXServer([TEST_CDX_DIR]))
assert_error(assert_404, NotFoundException)
def test_err_403():
assert_error(assert_403, AccessException)
def test_err_400():
assert_error(assert_400, BadRequestException)
def test_err_502():
assert_error(assert_502, WbException)

View File

@ -23,7 +23,7 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
""" """
from test_cdxserver import cdx_ops_test from test_cdxops import cdx_ops_test
from pywb import get_test_dir from pywb import get_test_dir