Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-24 06:59:52 +01:00)
improve testing and a few fixes:

archivalrouter: support empty collection, with and without SCRIPT_NAME
cdx: remote cdx source test, including access denied
replay: when content-length is present, limit the decompressed stream to content-length (this ensures the last 4 bytes of a warc/arc record are not read)
integration tests for identity replay
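
For context on the replay fix: the payload stream gets capped at the declared Content-Length, so the reader never consumes the four trailing CRLF bytes that terminate a WARC/ARC record. Below is a minimal sketch of that idea only; pywb's actual helper is pywb.utils.loaders.LimitReader (imported later in this diff), and the _LimitReaderSketch class here is a made-up illustration, not the real implementation.

import io

class _LimitReaderSketch(object):
    """Wrap a stream and stop returning data after `limit` bytes."""
    def __init__(self, stream, limit):
        self.stream = stream
        self.limit = limit

    def read(self, size=-1):
        if self.limit <= 0:
            return b''
        if size < 0 or size > self.limit:
            size = self.limit
        buff = self.stream.read(size)
        self.limit -= len(buff)
        return buff

# payload bytes followed by the WARC record's trailing '\r\n\r\n'
raw = io.BytesIO(b'hello world\r\n\r\n')
reader = _LimitReaderSketch(raw, 11)      # 11 == declared Content-Length
assert reader.read() == b'hello world'    # payload only
assert reader.read() == b''               # the 4 trailer bytes stay unread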
parent bff39626b5
commit 921b2eb2e1
@@ -50,7 +50,10 @@ class Route:
     def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
         self.path = regex
-        self.regex = re.compile(regex + lookahead)
+        if regex:
+            self.regex = re.compile(regex + lookahead)
+        else:
+            self.regex = re.compile('')
         self.handler = handler
         # collection id from regex group (default 0)
         self.coll_group = coll_group
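
A quick illustration of why the new else branch is needed (a sketch; the SLASH_QUERY_LOOKAHEAD value below is an assumption for illustration, not taken from this diff): appending the lookahead to an empty pattern would fail to match a bare URL, whereas a truly empty pattern matches every request with group(0) == '', which the empty-collection doctests added later in this commit rely on.

import re

SLASH_QUERY_LOOKAHEAD = r'(?=/|$|\?)'   # assumed value, for illustration only

with_lookahead = re.compile('' + SLASH_QUERY_LOOKAHEAD)
empty_pattern = re.compile('')           # what the else branch now compiles

# '' + lookahead does not match a bare URL: the next char is 'h', not '/', '?' or end
assert with_lookahead.match('http://example.com') is None

# an empty pattern matches any request, with an empty collection group
assert empty_pattern.match('http://example.com').group(0) == ''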
@@ -70,7 +73,6 @@ class Route:
             return None

         matched_str = matcher.group(0)
-
         if matched_str:
             rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
             wb_url_str = request_uri[len(matched_str) + 2:]  # remove the '/' + rel_prefix part of uri
@@ -1,6 +1,8 @@
 from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader

+from cdxobject import AccessException
+
 import urllib
 import urllib2
 import itertools
@@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
         self.key_prefix = self.DEFAULT_KEY_PREFIX
         if config:
             self.key_prefix = config.get('redis_key_prefix', self.key_prefix)


     def load_cdx(self, params):
         """
@@ -1,7 +1,7 @@


 #=================================================================
-class AllowAllPerms:
+class AllowAllPerms(object):
     """
     Sample Perm Checker which allows all
     """
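
The switch to AllowAllPerms(object) is likely tied to the test change near the end of this diff, where TestExclusionPerms calls super(...): under Python 2, which this codebase targets, super() only works with new-style classes. A minimal stand-alone illustration of the pattern (class names here are stand-ins, not pywb's):

class PermsBase(object):                  # new-style, so super() works on Python 2
    def allow_url_lookup(self, urlkey, url):
        return True

class ExclusionPerms(PermsBase):
    def allow_url_lookup(self, urlkey, url):
        if urlkey == 'org,iana)/_img/bookmark_icon.ico':
            return False
        return super(ExclusionPerms, self).allow_url_lookup(urlkey, url)

checker = ExclusionPerms()
assert checker.allow_url_lookup('org,iana)/_img/bookmark_icon.ico', None) is False
assert checker.allow_url_lookup('com,example)/', 'http://example.com/') is True
# If PermsBase were declared without `object`, the super() call above would
# raise a TypeError under Python 2 (old-style classes don't support super()).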
@@ -141,7 +141,7 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
  ('offset', '334'),
  ('filename', 'dupes.warc.gz')]

-# NOTE: external dependency -- need self-contained test
+# NOTE: external dependency -- need self-contained test TODO
 >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
 >>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
@@ -152,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
  ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
  ('length', '1792')]

+
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
+Traceback (most recent call last):
+AccessException: Blocked By Robots
 """

 #=================================================================
@@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
 from wbexceptions import CaptureException, InternalRedirect
 from pywb.warc.recordloader import ArchiveLoadFailed

+from pywb.utils.loaders import LimitReader

 #=================================================================
 class ReplayView:
@@ -54,10 +55,21 @@ class ReplayView:

         response = None

+        # if Content-Length for payload is present, ensure we don't read past it
+        content_len = status_headers.get_header('content-length')
+        try:
+            content_len = int(content_len)
+            if content_len > 0:
+                stream = LimitReader(stream, content_len)
+        except ValueError:
+            pass
+
         if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
             response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
         else:
             (status_headers, stream) = self.sanitize_content(status_headers, stream)
+            #status_headers.remove_header('content-length')

             response_iter = self.stream_to_iter(stream)
             response = WbResponse(status_headers, response_iter)

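
A small sanity check of the guard added above; parse_content_length is an illustrative helper, not a pywb function. Note the patch catches only ValueError: int() on a non-numeric string raises ValueError, while int(None) raises TypeError, so the code implicitly assumes get_header('content-length') returns a string whenever the header matters (an assumption not visible in this diff).

def parse_content_length(value):
    """Return a positive payload length, or None if it cannot be determined."""
    try:
        length = int(value)
        return length if length > 0 else None
    except (ValueError, TypeError):   # broader than the patch, for illustration
        return None

assert parse_content_length('1792') == 1792
assert parse_content_length('abc') is None    # ValueError -- also caught by the patch
assert parse_content_length(None) is None     # TypeError -- not caught by the patch
assert parse_content_length('0') is None      # zero length: stream left unwrapped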
@@ -15,6 +15,13 @@
 'wb_prefix': 'https://localhost:8081/my_pywb/web/',
 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}

+# route with no collection
+>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
+{'coll': '',
+ 'request_uri': 'http://example.com',
+ 'wb_prefix': '/pywb/',
+ 'wb_url': None}
+
 # not matching route -- skipped
 >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)

@@ -67,6 +74,13 @@ False
 >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
 False

+# With no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
+'http://localhost:8080/2013/http://example.com/other.html'
+
+# With SCRIPT_NAME but no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
+'http://localhost:8080/pywb-access/http://example.com/other.html'
+
 """

@@ -118,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
     >>> calc_search_range('http://example.com/path/file.html', 'host', False)
     ('example.com/', 'example.com0')

-    # domain range not supported
+    # errors: domain range not supported
     >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
     Traceback (most recent call last):
-    Exception: matchType=domain unsupported for non-surt
+    UrlCanonicalizeException: matchType=domain unsupported for non-surt
+
+    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
+    Traceback (most recent call last):
+    UrlCanonicalizeException: Invalid match_type: blah
+
     """
     def inc_last_char(x):
         return x[0:-1] + chr(ord(x[-1]) + 1)
@@ -159,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

     elif match_type == 'domain':
         if not surt_ordered:
-            raise Exception('matchType=domain unsupported for non-surt')
+            raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')

         host = start_key.split(')/')[0]

@@ -172,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

         end_key = host + '-'
     else:
-        raise Exception('Invalid match_type: ' + match_type)
+        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)

     return (start_key, end_key)

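
Replacing the bare Exception with UrlCanonicalizeException lets callers tell a bad query (for example an unknown matchType) apart from an unexpected server error. A hedged sketch of that benefit; the UrlCanonicalizeException class below is a local stand-in and the status-code mapping is illustrative, not pywb's actual error handling.

class UrlCanonicalizeException(Exception):
    """Local stand-in for pywb's exception type."""

def calc_range_or_400(match_type):
    valid = ('exact', 'prefix', 'host', 'domain')
    try:
        if match_type not in valid:
            raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
        return 200
    except UrlCanonicalizeException:
        return 400      # caller can report a bad request instead of crashing

assert calc_range_or_400('blah') == 400
assert calc_range_or_400('host') == 200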
@@ -2,6 +2,7 @@ import webtest
 from pywb.pywb_init import pywb_config
 from pywb.wbapp import create_wb_app
 from pywb.cdx.cdxobject import CDXObject
+from pywb.cdx.perms import AllowAllPerms

 class TestWb:
     TEST_CONFIG = 'test_config.yaml'
@@ -73,7 +74,19 @@ class TestWb:

         assert 'Mon, Jan 27 2014 17:12:38' in resp.body
         assert 'wb.js' in resp.body
-        assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
+        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body

+    def test_replay_identity_1(self):
+        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
+        #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
+        #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
+        #self._assert_basic_html(resp)
+
+        # no wb header insertion
+        assert 'wb.js' not in resp.body
+
+        # original unrewritten url present
+        assert '"http://www.iana.org/domains/example"' in resp.body
+
     def test_replay_content_length_1(self):
         # test larger file, rewritten file (svg!)
@@ -198,38 +211,21 @@ class TestWb:
 # Reporter callback for replay view
 class PrintReporter:
     def __call__(self, wbrequest, cdx, response):
-        print wbrequest
-        print cdx
+        #print wbrequest
+        #print cdx
         pass

 #=================================================================
-class TestExclusionPerms:
+class TestExclusionPerms(AllowAllPerms):
     """
-    Sample Perm Checker which allows all
+    Sample Perm Checker with hard-coded exclusion
     """
     def allow_url_lookup(self, urlkey, url):
         """
         Return true/false if url or urlkey (canonicalized url)
         should be allowed
         """
-        print urlkey
         if urlkey == 'org,iana)/_img/bookmark_icon.ico':
             return False

-        return True
-
-    def allow_capture(self, cdx):
-        """
-        Return true/false is specified capture (cdx) should be
-        allowed
-        """
-        return True
-
-    def filter_fields(self, cdx):
-        """
-        Filter out any forbidden cdx fields from cdx dictionary
-        """
-        return cdx
+        return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)