mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
improve testing and a few fixes:
- archivalrouter: support empty collection, with and without SCRIPT_NAME
- cdx: remote cdx source test, including access denied
- replay: when content-length is present, limit the decompressed stream to content-length (this ensures the last 4 bytes in a warc/arc record are not read)
- integration tests for identity replay
This commit is contained in:
parent
bff39626b5
commit
921b2eb2e1
@ -50,7 +50,10 @@ class Route:
|
||||
|
||||
def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
|
||||
self.path = regex
|
||||
self.regex = re.compile(regex + lookahead)
|
||||
if regex:
|
||||
self.regex = re.compile(regex + lookahead)
|
||||
else:
|
||||
self.regex = re.compile('')
|
||||
self.handler = handler
|
||||
# collection id from regex group (default 0)
|
||||
self.coll_group = coll_group
|
||||
@ -70,7 +73,6 @@ class Route:
|
||||
return None
|
||||
|
||||
matched_str = matcher.group(0)
|
||||
|
||||
if matched_str:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
|
||||
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
|
||||
|
@ -1,6 +1,8 @@
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.loaders import SeekableTextFileReader
|
||||
|
||||
from cdxobject import AccessException
|
||||
|
||||
import urllib
|
||||
import urllib2
|
||||
import itertools
|
||||
@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
|
||||
self.key_prefix = self.DEFAULT_KEY_PREFIX
|
||||
if config:
|
||||
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
|
||||
|
||||
|
||||
|
||||
def load_cdx(self, params):
|
||||
"""
|
||||
|
@ -1,7 +1,7 @@
|
||||
|
||||
|
||||
#=================================================================
|
||||
class AllowAllPerms:
|
||||
class AllowAllPerms(object):
|
||||
"""
|
||||
Sample Perm Checker which allows all
|
||||
"""
|
||||
|
@ -141,7 +141,7 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('offset', '334'),
|
||||
('filename', 'dupes.warc.gz')]
|
||||
|
||||
# NOTE: external dependency -- need self-contained test
|
||||
# NOTE: external dependency -- need self-contained test TODO
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
@ -152,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||
('length', '1792')]
|
||||
|
||||
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
||||
Traceback (most recent call last):
|
||||
AccessException: Blocked By Robots
|
||||
"""
|
||||
|
||||
#=================================================================
|
||||
|
@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
|
||||
from wbexceptions import CaptureException, InternalRedirect
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
|
||||
from pywb.utils.loaders import LimitReader
|
||||
|
||||
#=================================================================
|
||||
class ReplayView:
|
||||
@ -54,10 +55,21 @@ class ReplayView:
|
||||
|
||||
response = None
|
||||
|
||||
# if Content-Length for payload is present, ensure we don't read past it
|
||||
content_len = status_headers.get_header('content-length')
|
||||
try:
|
||||
content_len=int(content_len)
|
||||
if content_len > 0:
|
||||
stream = LimitReader(stream, content_len)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
|
||||
response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
|
||||
else:
|
||||
(status_headers, stream) = self.sanitize_content(status_headers, stream)
|
||||
#status_headers.remove_header('content-length')
|
||||
|
||||
response_iter = self.stream_to_iter(stream)
|
||||
response = WbResponse(status_headers, response_iter)
|
||||
|
||||
|
@ -15,6 +15,13 @@
|
||||
'wb_prefix': 'https://localhost:8081/my_pywb/web/',
|
||||
'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}
|
||||
|
||||
# route with no collection
|
||||
>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
|
||||
{'coll': '',
|
||||
'request_uri': 'http://example.com',
|
||||
'wb_prefix': '/pywb/',
|
||||
'wb_url': None}
|
||||
|
||||
# not matching route -- skipped
|
||||
>>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
|
||||
|
||||
@ -67,6 +74,13 @@ False
|
||||
>>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
|
||||
False
|
||||
|
||||
# With no collection
|
||||
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
|
||||
'http://localhost:8080/2013/http://example.com/other.html'
|
||||
|
||||
# With SCRIPT_NAME but no collection
|
||||
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
|
||||
'http://localhost:8080/pywb-access/http://example.com/other.html'
|
||||
|
||||
"""
|
||||
|
||||
|
@ -118,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
||||
>>> calc_search_range('http://example.com/path/file.html', 'host', False)
|
||||
('example.com/', 'example.com0')
|
||||
|
||||
# domain range not supported
|
||||
# errors: domain range not supported
|
||||
>>> calc_search_range('http://example.com/path/file.html', 'domain', False)
|
||||
Traceback (most recent call last):
|
||||
Exception: matchType=domain unsupported for non-surt
|
||||
UrlCanonicalizeException: matchType=domain unsupported for non-surt
|
||||
|
||||
>>> calc_search_range('http://example.com/path/file.html', 'blah', False)
|
||||
Traceback (most recent call last):
|
||||
UrlCanonicalizeException: Invalid match_type: blah
|
||||
|
||||
"""
|
||||
def inc_last_char(x):
|
||||
return x[0:-1] + chr(ord(x[-1]) + 1)
|
||||
@ -159,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
||||
|
||||
elif match_type == 'domain':
|
||||
if not surt_ordered:
|
||||
raise Exception('matchType=domain unsupported for non-surt')
|
||||
raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
|
||||
|
||||
host = start_key.split(')/')[0]
|
||||
|
||||
@ -172,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
||||
|
||||
end_key = host + '-'
|
||||
else:
|
||||
raise Exception('Invalid match_type: ' + match_type)
|
||||
raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
|
||||
|
||||
return (start_key, end_key)
|
||||
|
||||
|
@ -2,6 +2,7 @@ import webtest
|
||||
from pywb.pywb_init import pywb_config
|
||||
from pywb.wbapp import create_wb_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.cdx.perms import AllowAllPerms
|
||||
|
||||
class TestWb:
|
||||
TEST_CONFIG = 'test_config.yaml'
|
||||
@ -73,7 +74,19 @@ class TestWb:
|
||||
|
||||
assert 'Mon, Jan 27 2014 17:12:38' in resp.body
|
||||
assert 'wb.js' in resp.body
|
||||
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
|
||||
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
||||
|
||||
def test_replay_identity_1(self):
|
||||
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
|
||||
#resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
|
||||
#resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
|
||||
#self._assert_basic_html(resp)
|
||||
|
||||
# no wb header insertion
|
||||
assert 'wb.js' not in resp.body
|
||||
|
||||
# original unrewritten url present
|
||||
assert '"http://www.iana.org/domains/example"' in resp.body
|
||||
|
||||
def test_replay_content_length_1(self):
|
||||
# test larger file, rewritten file (svg!)
|
||||
@ -198,38 +211,21 @@ class TestWb:
|
||||
# Reporter callback for replay view
|
||||
class PrintReporter:
|
||||
def __call__(self, wbrequest, cdx, response):
|
||||
print wbrequest
|
||||
print cdx
|
||||
#print wbrequest
|
||||
#print cdx
|
||||
pass
|
||||
|
||||
#=================================================================
|
||||
class TestExclusionPerms:
|
||||
class TestExclusionPerms(AllowAllPerms):
|
||||
"""
|
||||
Sample Perm Checker which allows all
|
||||
Sample Perm Checker with hard-coded exclusion
|
||||
"""
|
||||
def allow_url_lookup(self, urlkey, url):
|
||||
"""
|
||||
Return true/false if url or urlkey (canonicalized url)
|
||||
should be allowed
|
||||
"""
|
||||
print urlkey
|
||||
if urlkey == 'org,iana)/_img/bookmark_icon.ico':
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def allow_capture(self, cdx):
|
||||
"""
|
||||
Return true/false is specified capture (cdx) should be
|
||||
allowed
|
||||
"""
|
||||
return True
|
||||
|
||||
def filter_fields(self, cdx):
|
||||
"""
|
||||
Filter out any forbidden cdx fields from cdx dictionary
|
||||
"""
|
||||
return cdx
|
||||
|
||||
|
||||
|
||||
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
|
||||
|
Loading…
x
Reference in New Issue
Block a user