1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

improve testing and a few fixes:

archivalrouter: support empty collection, with and without SCRIPT_NAME
cdx: remote cdx source test, including access denied
replay: when content-length is present, limit the decompressed stream to content-length
(this ensures the last 4 bytes in the warc/arc record are not read)
integration tests for identity replay
This commit is contained in:
Ilya Kreymer 2014-02-27 18:43:55 -08:00
parent bff39626b5
commit 921b2eb2e1
8 changed files with 67 additions and 32 deletions

View File

@ -50,7 +50,10 @@ class Route:
def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
self.path = regex
self.regex = re.compile(regex + lookahead)
if regex:
self.regex = re.compile(regex + lookahead)
else:
self.regex = re.compile('')
self.handler = handler
# collection id from regex group (default 0)
self.coll_group = coll_group
@ -70,7 +73,6 @@ class Route:
return None
matched_str = matcher.group(0)
if matched_str:
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri

View File

@ -1,6 +1,8 @@
from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader
from cdxobject import AccessException
import urllib
import urllib2
import itertools
@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
self.key_prefix = self.DEFAULT_KEY_PREFIX
if config:
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, params):
"""

View File

@ -1,7 +1,7 @@
#=================================================================
class AllowAllPerms:
class AllowAllPerms(object):
"""
Sample Perm Checker which allows all
"""

View File

@ -141,7 +141,7 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('offset', '334'),
('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test
# NOTE: external dependency -- need self-contained test TODO
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
@ -152,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
Traceback (most recent call last):
AccessException: Blocked By Robots
"""
#=================================================================

View File

@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import LimitReader
#=================================================================
class ReplayView:
@ -54,10 +55,21 @@ class ReplayView:
response = None
# if Content-Length for payload is present, ensure we don't read past it
content_len = status_headers.get_header('content-length')
try:
content_len=int(content_len)
if content_len > 0:
stream = LimitReader(stream, content_len)
except ValueError:
pass
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
else:
(status_headers, stream) = self.sanitize_content(status_headers, stream)
#status_headers.remove_header('content-length')
response_iter = self.stream_to_iter(stream)
response = WbResponse(status_headers, response_iter)

View File

@ -15,6 +15,13 @@
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
# route with no collection
>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
{'coll': '',
'request_uri': 'http://example.com',
'wb_prefix': '/pywb/',
'wb_url': None}
# not matching route -- skipped
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
@ -67,6 +74,13 @@ False
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False
# With no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
'http://localhost:8080/2013/http://example.com/other.html'
# With SCRIPT_NAME but no collection
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
'http://localhost:8080/pywb-access/http://example.com/other.html'
"""

View File

@ -118,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
>>> calc_search_range('http://example.com/path/file.html', 'host', False)
('example.com/', 'example.com0')
# domain range not supported
# errors: domain range not supported
>>> calc_search_range('http://example.com/path/file.html', 'domain', False)
Traceback (most recent call last):
Exception: matchType=domain unsupported for non-surt
UrlCanonicalizeException: matchType=domain unsupported for non-surt
>>> calc_search_range('http://example.com/path/file.html', 'blah', False)
Traceback (most recent call last):
UrlCanonicalizeException: Invalid match_type: blah
"""
def inc_last_char(x):
return x[0:-1] + chr(ord(x[-1]) + 1)
@ -159,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
elif match_type == 'domain':
if not surt_ordered:
raise Exception('matchType=domain unsupported for non-surt')
raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
host = start_key.split(')/')[0]
@ -172,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
end_key = host + '-'
else:
raise Exception('Invalid match_type: ' + match_type)
raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
return (start_key, end_key)

View File

@ -2,6 +2,7 @@ import webtest
from pywb.pywb_init import pywb_config
from pywb.wbapp import create_wb_app
from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.perms import AllowAllPerms
class TestWb:
TEST_CONFIG = 'test_config.yaml'
@ -73,7 +74,19 @@ class TestWb:
assert 'Mon, Jan 27 2014 17:12:38' in resp.body
assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
def test_replay_identity_1(self):
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
#resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
#resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
#self._assert_basic_html(resp)
# no wb header insertion
assert 'wb.js' not in resp.body
# original unrewritten url present
assert '"http://www.iana.org/domains/example"' in resp.body
def test_replay_content_length_1(self):
# test larger file, rewritten file (svg!)
@ -198,38 +211,21 @@ class TestWb:
# Reporter callback for replay view
class PrintReporter:
def __call__(self, wbrequest, cdx, response):
print wbrequest
print cdx
#print wbrequest
#print cdx
pass
#=================================================================
class TestExclusionPerms:
class TestExclusionPerms(AllowAllPerms):
"""
Sample Perm Checker which allows all
Sample Perm Checker with hard-coded exclusion
"""
def allow_url_lookup(self, urlkey, url):
"""
Return true/false if url or urlkey (canonicalized url)
should be allowed
"""
print urlkey
if urlkey == 'org,iana)/_img/bookmark_icon.ico':
return False
return True
def allow_capture(self, cdx):
"""
Return true/false is specified capture (cdx) should be
allowed
"""
return True
def filter_fields(self, cdx):
"""
Filter out any forbidden cdx fields from cdx dictionary
"""
return cdx
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)