diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 4d28b57e..5d3dc9f4 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -50,7 +50,10 @@ class Route: def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex - self.regex = re.compile(regex + lookahead) + if regex: + self.regex = re.compile(regex + lookahead) + else: + self.regex = re.compile('') self.handler = handler # collection id from regex group (default 0) self.coll_group = coll_group @@ -70,7 +73,6 @@ class Route: return None matched_str = matcher.group(0) - if matched_str: rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 783cf36b..ba5f8b3b 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,6 +1,8 @@ from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader +from cdxobject import AccessException + import urllib import urllib2 import itertools @@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource): self.key_prefix = self.DEFAULT_KEY_PREFIX if config: self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - + def load_cdx(self, params): """ diff --git a/pywb/cdx/perms.py b/pywb/cdx/perms.py index a7b90eb4..ad6ea00d 100644 --- a/pywb/cdx/perms.py +++ b/pywb/cdx/perms.py @@ -1,7 +1,7 @@ #================================================================= -class AllowAllPerms: +class AllowAllPerms(object): """ Sample Perm Checker which allows all """ diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 44483ca4..e5fac6b3 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -141,7 +141,7 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('offset', '334'), ('filename', 'dupes.warc.gz')] -# NOTE: external dependency -- need self-contained test +# NOTE: external dependency -- need self-contained test TODO >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), @@ -152,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('length', '1792')] + +>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') +Traceback (most recent call last): +AccessException: Blocked By Robots """ #================================================================= diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 9113ad5f..31e7af9a 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed +from pywb.utils.loaders import LimitReader #================================================================= class ReplayView: @@ -54,10 +55,21 @@ class ReplayView: response = None + # if Content-Length for payload is present, ensure we don't read past it + content_len = status_headers.get_header('content-length') + try: + content_len=int(content_len) + if content_len > 0: + stream = LimitReader(stream, content_len) + except ValueError: + pass + if self.content_rewriter and wbrequest.wb_url.mod != 'id_': response = self.rewrite_content(wbrequest, cdx, status_headers, stream) else: (status_headers, stream) = self.sanitize_content(status_headers, stream) + #status_headers.remove_header('content-length') + response_iter = self.stream_to_iter(stream) response = WbResponse(status_headers, response_iter) diff --git a/pywb/test/test_archivalrouter.py b/pywb/test/test_archivalrouter.py index 4379fbfd..229fafb6 100644 --- a/pywb/test/test_archivalrouter.py +++ b/pywb/test/test_archivalrouter.py @@ -15,6 +15,13 @@ 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')} +# route with no collection +>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False)) +{'coll': '', + 'request_uri': 'http://example.com', + 'wb_prefix': '/pywb/', + 'wb_url': None} + # not matching route -- skipped >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) @@ -67,6 +74,13 @@ False >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') False +# With no collection +>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='') +'http://localhost:8080/2013/http://example.com/other.html' + +# With SCRIPT_NAME but no collection +>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='') +'http://localhost:8080/pywb-access/http://example.com/other.html' """ diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index bd21e4ca..73555ca6 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -118,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): >>> calc_search_range('http://example.com/path/file.html', 'host', False) ('example.com/', 'example.com0') - # domain range not supported + # errors: domain range not supported >>> calc_search_range('http://example.com/path/file.html', 'domain', False) Traceback (most recent call last): - Exception: matchType=domain unsupported for non-surt + UrlCanonicalizeException: matchType=domain unsupported for non-surt + + >>> calc_search_range('http://example.com/path/file.html', 'blah', False) + Traceback (most recent call last): + UrlCanonicalizeException: Invalid match_type: blah + """ def inc_last_char(x): return x[0:-1] + chr(ord(x[-1]) + 1) @@ -159,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): elif match_type == 'domain': if not surt_ordered: - raise Exception('matchType=domain unsupported for non-surt') + raise UrlCanonicalizeException('matchType=domain unsupported for non-surt') host = start_key.split(')/')[0] @@ -172,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): end_key = host + '-' else: - raise Exception('Invalid match_type: ' + match_type) + raise UrlCanonicalizeException('Invalid match_type: ' + match_type) return (start_key, end_key) diff --git a/tests/test_integration.py b/tests/test_integration.py index 1a7a943c..5a165041 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,6 +2,7 @@ import webtest from pywb.pywb_init import pywb_config from pywb.wbapp import create_wb_app from pywb.cdx.cdxobject import CDXObject +from pywb.cdx.perms import AllowAllPerms class TestWb: TEST_CONFIG = 'test_config.yaml' @@ -73,7 +74,19 @@ class TestWb: assert 'Mon, Jan 27 2014 17:12:38' in resp.body assert 'wb.js' in resp.body - assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body + assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body + + def test_replay_identity_1(self): + resp = self.testapp.get('/pywb/20140127171251id_/http://example.com') + #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg') + #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css') + #self._assert_basic_html(resp) + + # no wb header insertion + assert 'wb.js' not in resp.body + + # original unrewritten url present + assert '"http://www.iana.org/domains/example"' in resp.body def test_replay_content_length_1(self): # test larger file, rewritten file (svg!) @@ -198,38 +211,21 @@ class TestWb: # Reporter callback for replay view class PrintReporter: def __call__(self, wbrequest, cdx, response): - print wbrequest - print cdx + #print wbrequest + #print cdx pass #================================================================= -class TestExclusionPerms: +class TestExclusionPerms(AllowAllPerms): """ - Sample Perm Checker which allows all + Sample Perm Checker with hard-coded exclusion """ def allow_url_lookup(self, urlkey, url): """ Return true/false if url or urlkey (canonicalized url) should be allowed """ - print urlkey if urlkey == 'org,iana)/_img/bookmark_icon.ico': return False - return True - - def allow_capture(self, cdx): - """ - Return true/false is specified capture (cdx) should be - allowed - """ - return True - - def filter_fields(self, cdx): - """ - Filter out any forbidden cdx fields from cdx dictionary - """ - return cdx - - - + return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)