improve testing and a few fixes:

archivalrouter: support empty collection, with and without SCRIPT_NAME; cdx: remote cdx source test, including access denied; replay: when content-length present, limit the decompressed stream to content-length (this ensures last 4 bytes in warc/arc record are not read); integration tests for identity replay
2025-03-15 00:03:28 +01:00 · 2014-02-27 18:43:55 -08:00 · 2014-02-27 18:43:55 -08:00 · 921b2eb2e1
commit 921b2eb2e1
parent bff39626b5
8 changed files with 67 additions and 32 deletions
--- a/pywb/archivalrouter.py
+++ b/pywb/archivalrouter.py
@ -50,7 +50,10 @@ class Route:

    def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
        self.path = regex
-        self.regex = re.compile(regex + lookahead)
+        if regex:
+            self.regex = re.compile(regex + lookahead)
+        else:
+            self.regex = re.compile('')
        self.handler = handler
        # collection id from regex group (default 0)
        self.coll_group = coll_group
@ -70,7 +73,6 @@ class Route:
            return None

        matched_str = matcher.group(0)
-
        if matched_str:
            rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
            wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -1,6 +1,8 @@
 from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader

+from cdxobject import AccessException
+
 import urllib
 import urllib2
 import itertools
@ -93,7 +95,7 @@ class RedisCDXSource(CDXSource):
        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
-        
+

    def load_cdx(self, params):
        """
--- a/pywb/cdx/perms.py
+++ b/pywb/cdx/perms.py
@ -1,7 +1,7 @@


 #=================================================================
-class AllowAllPerms:
+class AllowAllPerms(object):
    """
    Sample Perm Checker which allows all
    """
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -141,7 +141,7 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('offset', '334'),
 ('filename', 'dupes.warc.gz')]

-# NOTE: external dependency -- need self-contained test
+# NOTE: external dependency -- need self-contained test TODO
 >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
 >>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
@ -152,6 +152,10 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
 ('length', '1792')]

+
+>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
+Traceback (most recent call last):
+AccessException: Blocked By Robots
 """

 #=================================================================
--- a/pywb/replay_views.py
+++ b/pywb/replay_views.py
@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
 from wbexceptions import CaptureException, InternalRedirect
 from pywb.warc.recordloader import ArchiveLoadFailed

+from pywb.utils.loaders import LimitReader

 #=================================================================
 class ReplayView:
@ -54,10 +55,21 @@ class ReplayView:

                response = None

+                # if Content-Length for payload is present, ensure we don't read past it
+                content_len = status_headers.get_header('content-length')
+                try:
+                    content_len=int(content_len)
+                    if content_len > 0:
+                        stream = LimitReader(stream, content_len)
+                except ValueError:
+                    pass
+
                if self.content_rewriter and wbrequest.wb_url.mod != 'id_':
                    response = self.rewrite_content(wbrequest, cdx, status_headers, stream)
                else:
                    (status_headers, stream) = self.sanitize_content(status_headers, stream)
+                    #status_headers.remove_header('content-length')
+
                    response_iter = self.stream_to_iter(stream)
                    response = WbResponse(status_headers, response_iter)

--- a/pywb/test/test_archivalrouter.py
+++ b/pywb/test/test_archivalrouter.py
@ -15,6 +15,13 @@
 'wb_prefix': 'https://localhost:8081/my_pywb/web/',
 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')}

+# route with no collection
+>>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False))
+{'coll': '',
+ 'request_uri': 'http://example.com',
+ 'wb_prefix': '/pywb/',
+ 'wb_url': None}
+
 # not matching route -- skipped
 >>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)

@ -67,6 +74,13 @@ False
 >>> _test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
 False

+# With no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/2013/http://example.com/path/page.html', coll='')
+'http://localhost:8080/2013/http://example.com/other.html'
+
+# With SCRIPT_NAME but no collection
+>>> _test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/pywb-access/http://example.com/path/page.html', '/pywb-access', coll='')
+'http://localhost:8080/pywb-access/http://example.com/other.html'

 """

--- a/pywb/utils/canonicalize.py
+++ b/pywb/utils/canonicalize.py
@ -118,10 +118,15 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

-    # domain range not supported
+    # errors: domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
-    Exception: matchType=domain unsupported for non-surt
+    UrlCanonicalizeException: matchType=domain unsupported for non-surt
+
+    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
+    Traceback (most recent call last):
+    UrlCanonicalizeException: Invalid match_type: blah
+
    """
    def inc_last_char(x):
        return x[0:-1] + chr(ord(x[-1]) + 1)
@ -159,7 +164,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

    elif match_type == 'domain':
        if not surt_ordered:
-            raise Exception('matchType=domain unsupported for non-surt')
+            raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')

        host = start_key.split(')/')[0]

@ -172,7 +177,7 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):

        end_key = host + '-'
    else:
-        raise Exception('Invalid match_type: ' + match_type)
+        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)

    return (start_key, end_key)

--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -2,6 +2,7 @@ import webtest
 from pywb.pywb_init import pywb_config
 from pywb.wbapp import create_wb_app
 from pywb.cdx.cdxobject import CDXObject
+from pywb.cdx.perms import AllowAllPerms

 class TestWb:
    TEST_CONFIG = 'test_config.yaml'
@ -73,7 +74,19 @@ class TestWb:

        assert 'Mon, Jan 27 2014 17:12:38' in resp.body
        assert 'wb.js' in resp.body
-        assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
+        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
+
+    def test_replay_identity_1(self):
+        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
+        #resp = self.testapp.get('/pywb/20140126200654id_/http://www.iana.org/_img/2013.1/rir-map.svg')
+        #resp = self.testapp.get('/pywb/20140127171239id_/http://www.iana.org/_css/2013.1/screen.css')
+        #self._assert_basic_html(resp)
+
+        # no wb header insertion
+        assert 'wb.js' not in resp.body
+
+        # original unrewritten url present
+        assert '"http://www.iana.org/domains/example"' in resp.body

    def test_replay_content_length_1(self):
        # test larger file, rewritten file (svg!)
@ -198,38 +211,21 @@ class TestWb:
 # Reporter callback for replay view
 class PrintReporter:
    def __call__(self, wbrequest, cdx, response):
-        print wbrequest
-        print cdx
+        #print wbrequest
+        #print cdx
        pass

 #=================================================================
-class TestExclusionPerms:
+class TestExclusionPerms(AllowAllPerms):
    """
-    Sample Perm Checker which allows all
+    Sample Perm Checker with hard-coded exclusion
    """
    def allow_url_lookup(self, urlkey, url):
        """
        Return true/false if url or urlkey (canonicalized url)
        should be allowed
        """
-        print urlkey
        if urlkey == 'org,iana)/_img/bookmark_icon.ico':
            return False

-        return True
-
-    def allow_capture(self, cdx):
-        """
-        Return true/false is specified capture (cdx) should be
-        allowed
-        """
-        return True
-
-    def filter_fields(self, cdx):
-        """
-        Filter out any forbidden cdx fields from cdx dictionary
-        """
-        return cdx
-
-
-
+        return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)