- support for allow/block/exclude access controls (as per ukwa/ukwa-pywb#7)

- .aclj files contain access controls in reverse sorted, CDXJ-like format - ./sample_archive/acl contains sample acl files - directory and single-file acl sources (extend directory aggregator and file index source) - tests for longest-prefix acl match - tests for acl applied to collection - pywb.utils.merge -- merge(..., reverse=True) support for py2.7 (backported from py3.5) - acl types: * allow - all allowed * block - allowed in index (as blocked) but content not allowed, served as 451 * exclude - removed from index and content, served as 404 - warcserver: AccessChecker inited if 'acl_paths' specified in custom collections - exceptions: * clean up wbexception, subclasses provide the status code, message loaded automatically * warcserver handles AccessException with json response (now with 451 status) * pass status to template to allow custom handling
2025-03-15 00:03:28 +01:00 · 2018-02-17 21:59:30 -08:00 · 2018-02-17 21:59:30 -08:00 · 77eefcdce6
commit 77eefcdce6
parent 5b7ca18e0f
23 changed files with 562 additions and 59 deletions
--- a/pywb/apps/frontendapp.py
+++ b/pywb/apps/frontendapp.py
@ -17,13 +17,14 @@ from pywb.recorder.recorderapp import RecorderApp
 from pywb.utils.loaders import load_yaml_config
 from pywb.utils.geventserver import GeventServer
 from pywb.utils.io import StreamIter
 from pywb.utils.wbexception import NotFoundException, WbException
 from pywb.warcserver.warcserver import WarcServer
 from pywb.rewrite.templateview import BaseInsertView
 from pywb.apps.static_handler import StaticHandler
-from pywb.apps.rewriterapp import RewriterApp, UpstreamException
+from pywb.apps.rewriterapp import RewriterApp
 from pywb.apps.wbrequestresponse import WbResponse
 import os
@ -382,8 +383,8 @@ class FrontEndApp(object):
            wb_url_str = wb_url_str.replace('timemap/{0}/'.format(timemap_output), '')
        try:
            response = self.rewriterapp.render_content(wb_url_str, metadata, environ)
-        except UpstreamException as ue:
+        except WbException as wbe:
-            response = self.rewriterapp.handle_error(environ, ue)
+            response = self.rewriterapp.handle_error(environ, wbe)
            raise HTTPException(response=response)
        return response
@ -446,7 +447,7 @@ class FrontEndApp(object):
        :param dict environ: The WSGI environment dictionary for the request
        :param str msg: The error message
        """
-        raise NotFound(response=self.rewriterapp._error_response(environ, msg))
+        raise NotFound(response=self.rewriterapp._error_response(environ, NotFoundException(msg)))
    def _check_refer_redirect(self, environ):
        """Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header
@ -513,7 +514,7 @@ class FrontEndApp(object):
            if self.debug:
                traceback.print_exc()
-            response = self.rewriterapp._error_response(environ, 'Internal Error: ' + str(e), '500 Server Error')
+            response = self.rewriterapp._error_response(environ, WbException('Internal Error: ' + str(e)))
            return response(environ, start_response)
    @classmethod
--- a/pywb/apps/rewriterapp.py
+++ b/pywb/apps/rewriterapp.py
@ -27,7 +27,11 @@ from pywb.warcserver.index.cdxobject import CDXObject
 class UpstreamException(WbException):
    def __init__(self, status_code, url, details):
        super(UpstreamException, self).__init__(url=url, msg=details)
-        self.status_code = status_code
+        self._status_code = status_code
    @property
    def status_code(self):
        return self._status_code
 # ============================================================================
@ -502,24 +506,24 @@ class RewriterApp(object):
        top_url += wb_url.to_str(mod='')
        return top_url
-    def handle_error(self, environ, ue):
+    def handle_error(self, environ, wbe):
-        if ue.status_code == 404:
+        if wbe.status_code == 404:
-            return self._not_found_response(environ, ue.url)
+            return self._not_found_response(environ, wbe.url)
        else:
-            status = str(ue.status_code) + ' ' + HTTP_STATUS_CODES.get(ue.status_code, 'Unknown Error')
+            return self._error_response(environ, wbe)
            return self._error_response(environ, ue.url, ue.msg,
                                        status=status)
    def _not_found_response(self, environ, url):
        resp = self.not_found_view.render_to_string(environ, url=url)
        return WbResponse.text_response(resp, status='404 Not Found', content_type='text/html')
-    def _error_response(self, environ, msg='', details='', status='404 Not Found'):
+    def _error_response(self, environ, wbe):
        status = wbe.status()
        resp = self.error_view.render_to_string(environ,
-                                                err_msg=msg,
+                                                err_msg=wbe.url,
-                                                err_details=details)
+                                                err_details=wbe.msg,
                                                err_status=wbe.status_code)
        return WbResponse.text_response(resp, status=status, content_type='text/html')
--- a/pywb/default_config.yaml
+++ b/pywb/default_config.yaml
@ -3,8 +3,11 @@ collections_root: collections
 # Per-Collection Paths
 archive_paths: archive
 index_paths: indexes
 acl_paths: acl
 static_path: static
 default_access: allow
 templates_dir: templates
 # Template HTML
--- a/pywb/utils/binsearch.py
+++ b/pywb/utils/binsearch.py
@ -89,9 +89,13 @@ def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
            matched = True
            break
-    # no matches, so return empty iterator
+    # no matches, so pop last line, but return rest of prev lines, if any
    if not matched:
-        return iter([])
+        if not prev_size or len(prev_deque) <= 1:
            return iter([])
        prev_deque.popleft()
        return iter(prev_deque)
    return itertools.chain(prev_deque, iter_)
--- a/pywb/utils/merge.py
+++ b/pywb/utils/merge.py
@ -0,0 +1,112 @@
 import sys
 if sys.version_info >= (3, 5):  #pragma: no cover
    from heapq import merge
 else:  #pragma: no cover
    # ported from python 3.5 heapq merge with reverse=True support
    from heapq import heapify, heappop, heapreplace
    from heapq import _heapify_max, _siftup_max
    def _heappop_max(heap):
        """Maxheap version of a heappop."""
        lastelt = heap.pop()    # raises appropriate IndexError if heap is empty
        if heap:
            returnitem = heap[0]
            heap[0] = lastelt
            _siftup_max(heap, 0)
            return returnitem
        return lastelt
    def _heapreplace_max(heap, item):
        """Maxheap version of a heappop followed by a heappush."""
        returnitem = heap[0]    # raises appropriate IndexError if heap is empty
        heap[0] = item
        _siftup_max(heap, 0)
        return returnitem
    def _get_next_iter(it):
        return it.__next__ if hasattr(it, '__next__') else it.next
    def merge(*iterables, **kwargs):
        '''Merge multiple sorted inputs into a single sorted output.
        Similar to sorted(itertools.chain(*iterables)) but returns a generator,
        does not pull the data into memory all at once, and assumes that each of
        the input streams is already sorted (smallest to largest).
        >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
        [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
        If *key* is not None, applies a key function to each element to determine
        its sort order.
        >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
        ['dog', 'cat', 'fish', 'horse', 'kangaroo']
        '''
        key = kwargs.get('key', None)
        reverse = kwargs.get('reverse', False)
        h = []
        h_append = h.append
        if reverse:
            _heapify = _heapify_max
            _heappop = _heappop_max
            _heapreplace = _heapreplace_max
            direction = -1
        else:
            _heapify = heapify
            _heappop = heappop
            _heapreplace = heapreplace
            direction = 1
        if key is None:
            for order, it in enumerate(map(iter, iterables)):
                try:
                    next = _get_next_iter(it)
                    h_append([next(), order * direction, next])
                except StopIteration:
                    pass
            _heapify(h)
            while len(h) > 1:
                try:
                    while True:
                        value, order, next = s = h[0]
                        yield value
                        s[0] = next()           # raises StopIteration when exhausted
                        _heapreplace(h, s)      # restore heap condition
                except StopIteration:
                    _heappop(h)                 # remove empty iterator
            if h:
                # fast case when only a single iterator remains
                value, order, next = h[0]
                yield value
                for v in next.__self__:
                    yield v
            return
        for order, it in enumerate(map(iter, iterables)):
            try:
                next = _get_next_iter(it)
                value = next()
                h_append([key(value), order * direction, value, next])
            except StopIteration:
                pass
        _heapify(h)
        while len(h) > 1:
            try:
                while True:
                    key_value, order, value, next = s = h[0]
                    yield value
                    value = next()
                    s[0] = key(value)
                    s[2] = value
                    _heapreplace(h, s)
            except StopIteration:
                _heappop(h)
        if h:
            key_value, order, value, next = h[0]
            yield value
            for v in next.__self__:
                yield v
--- a/pywb/utils/test/test_binsearch.py
+++ b/pywb/utils/test/test_binsearch.py
@ -23,6 +23,9 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 >>> print_binsearch_results('org,iana)/time-zones', iter_exact)
 org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
 >>> print_binsearch_results_range('org,iana)/time-zones', 'org,iana)/time-zones!', iter_range)
 org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
 # Exact search -- no matches
 >>> print_binsearch_results('org,iaana)/', iter_exact)
 >>> print_binsearch_results('org,ibna)/', iter_exact)
@ -74,6 +77,7 @@ org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200
 #=================================================================
 import os
 from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
 from pywb.utils.merge import merge
 from pywb import get_test_dir
@ -91,6 +95,22 @@ def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
            print(line.decode('utf-8'))
 def test_rev_merge():
    with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
        lines1 = cdx.readlines()
    with open(test_cdx_dir + 'dupes.cdx', 'rb') as cdx:
        lines2 = cdx.readlines()
    # check reverse merge: verify merging of lists, than reversing
    # eqauls merging with reverse=True of reversed lists
    assert (list(reversed(list(merge(lines1, lines2)))) ==
            list(merge(reversed(lines1), reversed(lines2), reverse=True)))
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/utils/wbexception.py
+++ b/pywb/utils/wbexception.py
@ -1,3 +1,4 @@
 from werkzeug.http import HTTP_STATUS_CODES
 #=================================================================
@ -7,6 +8,13 @@ class WbException(Exception):
        self.msg = msg
        self.url = url
    @property
    def status_code(self):
        return 500
    def status(self):
        return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error')
    def __repr__(self):
        return "{0}('{1}',)".format(self.__class__.__name__,  self.msg)
@ -17,25 +25,28 @@ class WbException(Exception):
 #=================================================================
 class AccessException(WbException):
-    def status(self):
+    @property
-        return '403 Access Denied'
+    def status_code(self):
        return 451
 #=================================================================
 class BadRequestException(WbException):
-    def status(self):
+    @property
-        return '400 Bad Request'
+    def status_code(self):
        return 400
 #=================================================================
 class NotFoundException(WbException):
-    def status(self):
+    @property
-        return '404 Not Found'
+    def status_code(self):
        return 404
 #=================================================================
 class LiveResourceException(WbException):
-    def status(self):
+    @property
-        return '400 Bad Live Resource'
+    def status_code(self):
-
+        return 400
--- a/pywb/warcserver/access_checker.py
+++ b/pywb/warcserver/access_checker.py
@ -0,0 +1,91 @@
 from pywb.warcserver.index.indexsource import FileIndexSource
 from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
 from pywb.utils.binsearch import search
 from pywb.utils.merge import merge
 import os
 # ============================================================================
 class FileAccessIndexSource(FileIndexSource):
    @staticmethod
    def rev_cmp(a, b):
        return (a < b) - (a > b)
    def _get_gen(self, fh, params):
        return search(fh, params['key'], prev_size=1, compare_func=self.rev_cmp)
 # ============================================================================
 class DirectoryAccessSource(DirectoryIndexSource):
    INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
    def _merge(self, iter_list):
        return merge(*(iter_list), reverse=True)
 # ============================================================================
 class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
    pass
 # ============================================================================
 class AccessChecker(object):
    def __init__(self, access_source_file, default_access='allow'):
        if isinstance(access_source_file, str):
            self.aggregator = self.create_access_aggregator(access_source_file)
        else:
            self.aggregator = access_source_file
        self.default_rule = {'urlkey': '', 'access': default_access}
    def create_access_aggregator(self, filename):
        if os.path.isdir(filename):
            return CacheDirectoryAccessSource(filename)
        elif os.path.isfile(filename):
            return FileAccessIndexSource(filename)
        else:
            raise Exception('Invalid Access Source: ' + filename)
    def find_access_rule(self, url, ts=None, urlkey=None):
        params = {'url': url, 'urlkey': urlkey}
        cdx_iter, errs = self.aggregator(params)
        if errs:
            print(errs)
        key = params['key'].decode('utf-8')
        for cdx in cdx_iter:
            if 'urlkey' not in cdx:
                continue
            if key.startswith(cdx['urlkey']):
                return cdx
        return self.default_rule
    def __call__(self, res):
        cdx_iter, errs = res
        return self.wrap_iter(cdx_iter), errs
    def wrap_iter(self, cdx_iter):
        last_rule = None
        last_url = None
        for cdx in cdx_iter:
            url = cdx.get('url')
            # if no url, possible idx or other object, don't apply any checks and pass through
            if not url:
                yield cdx
                continue
            rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey'))
            access = rule.get('access', 'exclude')
            if access == 'exclude':
                continue
            cdx['access'] = access
            yield cdx
--- a/pywb/warcserver/basewarcserver.py
+++ b/pywb/warcserver/basewarcserver.py
@ -1,6 +1,8 @@
 from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
 from pywb.utils.format import query_to_dict
 from pywb.utils.wbexception import AccessException
 from werkzeug.routing import Map, Rule
 from werkzeug.exceptions import HTTPException
@ -90,6 +92,12 @@ class BaseWarcServer(object):
            start_response('200 OK', list(out_headers.items()))
            return res
        except AccessException as ae:
            out_headers = {}
            res = self.json_encode(ae.msg, out_headers)
            start_response(ae.status(), list(out_headers.items()))
            return res
        except Exception as e:
            if self.debug:
                traceback.print_exc()
@ -107,6 +115,7 @@ class BaseWarcServer(object):
    def send_error(self, errs, start_response,
                   message='No Resource Found', status=404):
        last_exc = errs.pop('last_exc', None)
        if last_exc:
            if self.debug:
--- a/pywb/warcserver/handlers.py
+++ b/pywb/warcserver/handlers.py
@ -1,4 +1,4 @@
-from pywb.utils.wbexception import BadRequestException, WbException
+from pywb.utils.wbexception import BadRequestException, WbException, AccessException
 from pywb.utils.wbexception import NotFoundException
 from pywb.utils.memento import MementoUtils
@ -48,6 +48,7 @@ class IndexHandler(object):
        self.index_source = index_source
        self.opts = opts or {}
        self.fuzzy = FuzzyMatcher(kwargs.get('rules_file'))
        self.access_checker = kwargs.get('access_checker')
    def get_supported_modes(self):
        return dict(modes=['list_sources', 'index'])
@ -62,7 +63,12 @@ class IndexHandler(object):
        if input_req:
            params['alt_url'] = input_req.include_method_query(url)
-        return self.fuzzy(self.index_source, params)
+        cdx_iter = self.fuzzy(self.index_source, params)
        if self.access_checker:
            cdx_iter = self.access_checker(cdx_iter)
        return cdx_iter
    def __call__(self, params):
        mode = params.get('mode', 'index')
@ -101,8 +107,8 @@ class IndexHandler(object):
 #=============================================================================
 class ResourceHandler(IndexHandler):
-    def __init__(self, index_source, resource_loaders, rules_file=None):
+    def __init__(self, index_source, resource_loaders, **kwargs):
-        super(ResourceHandler, self).__init__(index_source, rules_file=rules_file)
+        super(ResourceHandler, self).__init__(index_source, **kwargs)
        self.resource_loaders = resource_loaders
    def get_supported_modes(self):
@ -121,6 +127,11 @@ class ResourceHandler(IndexHandler):
        last_exc = None
        for cdx in cdx_iter:
            if cdx.get('access', 'allow') != 'allow':
                raise AccessException(msg={'access': cdx['access'],
                                           'access_status': cdx.get('access_status', 451)},
                                      url=cdx['url'])
            for loader in self.resource_loaders:
                try:
                    out_headers, resp = loader(cdx, params)
@ -141,13 +152,12 @@ class ResourceHandler(IndexHandler):
 #=============================================================================
 class DefaultResourceHandler(ResourceHandler):
    def __init__(self, index_source, warc_paths='', forward_proxy_prefix='',
-                 rules_file=''):
+                 **kwargs):
        loaders = [WARCPathLoader(warc_paths, index_source),
                   LiveWebLoader(forward_proxy_prefix),
                   VideoLoader()
                  ]
-        super(DefaultResourceHandler, self).__init__(index_source, loaders,
+        super(DefaultResourceHandler, self).__init__(index_source, loaders, **kwargs)
                                                     rules_file=rules_file)
 #=============================================================================
--- a/pywb/warcserver/index/aggregator.py
+++ b/pywb/warcserver/index/aggregator.py
@ -90,10 +90,13 @@ class BaseAggregator(object):
        if len(iter_list) <= 1:
            cdx_iter = iter_list[0] if iter_list else iter([])
        else:
-            cdx_iter = merge(*(iter_list))
+            cdx_iter = self._merge(iter_list)
        return cdx_iter, err_list
    def _merge(self, iter_list):
        return merge(*(iter_list))
    def _on_source_error(self, name):  #pragma: no cover
        pass
@ -257,6 +260,11 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato
 #=============================================================================
 class BaseDirectoryIndexSource(BaseAggregator):
    INDEX_SOURCES = [
                     (FileIndexSource.CDX_EXT, FileIndexSource),
                     (ZipNumIndexSource.IDX_EXT, ZipNumIndexSource)
                    ]
    def __init__(self, base_prefix, base_dir='', name='', config=None):
        self.base_prefix = base_prefix
        self.base_dir = base_dir
@ -280,13 +288,13 @@ class BaseDirectoryIndexSource(BaseAggregator):
    def _load_files_single_dir(self, the_dir):
        for name in os.listdir(the_dir):
-            filename = os.path.join(the_dir, name)
+            for ext, cls in self.INDEX_SOURCES:
                if not name.endswith(ext):
                    continue
-            is_cdx = filename.endswith(FileIndexSource.CDX_EXT)
+                filename = os.path.join(the_dir, name)
            is_zip = filename.endswith(ZipNumIndexSource.IDX_EXT)
-            if is_cdx or is_zip:
+                 #print('Adding ' + filename)
                #print('Adding ' + filename)
                rel_path = os.path.relpath(the_dir, self.base_prefix)
                if rel_path == '.':
                    full_name = name
@ -296,10 +304,7 @@ class BaseDirectoryIndexSource(BaseAggregator):
                if self.name:
                    full_name = self.name + ':' + full_name
-                if is_cdx:
+                index_src = cls(filename, self.config)
                    index_src = FileIndexSource(filename)
                else:
                    index_src = ZipNumIndexSource(filename, self.config)
                yield full_name, index_src
@ -341,9 +346,9 @@ class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
 #=============================================================================
-class CacheDirectoryIndexSource(DirectoryIndexSource):
+class CacheDirectoryMixin(object):
    def __init__(self, *args, **kwargs):
-        super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs)
+        super(CacheDirectoryMixin, self).__init__(*args, **kwargs)
        self.cached_file_list = {}
    def _load_files_single_dir(self, the_dir):
@ -360,12 +365,17 @@ class CacheDirectoryIndexSource(DirectoryIndexSource):
                print('Dir {0} unchanged'.format(the_dir))
                return files
-        files = super(CacheDirectoryIndexSource, self)._load_files_single_dir(the_dir)
+        files = super(CacheDirectoryMixin, self)._load_files_single_dir(the_dir)
        files = list(files)
        self.cached_file_list[the_dir] = (stat, files)
        return files
 #=============================================================================
 class CacheDirectoryIndexSource(CacheDirectoryMixin, DirectoryIndexSource):
    pass
 #=============================================================================
 class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource):
    def _iter_sources(self, params):
--- a/pywb/warcserver/index/cdxobject.py
+++ b/pywb/warcserver/index/cdxobject.py
@ -36,8 +36,9 @@ ORIG_FILENAME = 'orig.filename'
 #=================================================================
 class CDXException(WbException):
-    def status(self):
+    @property
-        return '400 Bad Request'
+    def status_code(self):
        return 400
 #=================================================================
@ -132,7 +133,7 @@ class CDXObject(OrderedDict):
                        v = quote(v.encode('utf-8'), safe=':/')
                if n != 'filename':
-                    v = to_native_str(v, 'utf-8')
+                    v = to_native_str(v, 'utf-8') or v
                self[n] = v
--- a/pywb/warcserver/index/indexsource.py
+++ b/pywb/warcserver/index/indexsource.py
@ -67,21 +67,26 @@ class BaseIndexSource(object):
 class FileIndexSource(BaseIndexSource):
    CDX_EXT = ('.cdx', '.cdxj')
-    def __init__(self, filename):
+    def __init__(self, filename, config=None):
        self.filename_template = filename
    def _do_open(self, filename):
        try:
            return open(filename, 'rb')
        except IOError:
            raise NotFoundException(filename)
    def _get_gen(self, fh, params):
        return iter_range(fh, params['key'], params['end_key'])
    def load_index(self, params):
        filename = res_template(self.filename_template, params)
-        try:
+        fh = self._do_open(filename)
            fh = open(filename, 'rb')
        except IOError:
            raise NotFoundException(filename)
        def do_load(fh):
            with fh:
-                gen = iter_range(fh, params['key'], params['end_key'])
+                for line in self._get_gen(fh, params):
                for line in gen:
                    yield CDXObject(line)
        return do_load(fh)
--- a/pywb/warcserver/test/test_access.py
+++ b/pywb/warcserver/test/test_access.py
@ -0,0 +1,116 @@
 from mock import patch
 import shutil
 import os
 from pywb.warcserver.index.aggregator import SimpleAggregator
 from pywb.warcserver.access_checker import FileAccessIndexSource, AccessChecker, DirectoryAccessSource
 from pywb.warcserver.test.testutils import to_path, TempDirTests, BaseTestClass
 from pywb import get_test_dir
 TEST_EXCL_PATH = to_path(get_test_dir() + '/access/')
 # ============================================================================
 class TestAccess(TempDirTests, BaseTestClass):
    def test_allows_only_default_block(self):
        agg = SimpleAggregator({'source': FileAccessIndexSource(TEST_EXCL_PATH + 'allows.aclj')})
        access = AccessChecker(agg, default_access='block')
        edx = access.find_access_rule('http://example.net')
        assert edx['urlkey'] == 'net,'
        edx = access.find_access_rule('http://foo.example.net/abc')
        assert edx['urlkey'] == 'net,'
        edx = access.find_access_rule('https://example.net/test/')
        assert edx['urlkey'] == 'net,example)/test'
        edx = access.find_access_rule('https://example.org/')
        assert edx['urlkey'] == ''
        assert edx['access'] == 'block'
        edx = access.find_access_rule('https://abc.domain.net/path')
        assert edx['urlkey'] == 'net,domain,'
        edx = access.find_access_rule('https://domain.neta/path')
        assert edx['urlkey'] == ''
        assert edx['access'] == 'block'
    def test_blocks_only(self):
        agg = SimpleAggregator({'source': FileAccessIndexSource(TEST_EXCL_PATH + 'blocks.aclj')})
        access = AccessChecker(agg)
        edx = access.find_access_rule('https://example.com/foo')
        assert edx['urlkey'] == 'com,example)/foo'
        assert edx['access'] == 'exclude'
        edx = access.find_access_rule('https://example.com/food')
        assert edx['urlkey'] == 'com,example)/foo'
        assert edx['access'] == 'exclude'
        edx = access.find_access_rule('https://example.com/foo/path')
        assert edx['urlkey'] == 'com,example)/foo'
        assert edx['access'] == 'exclude'
        edx = access.find_access_rule('https://example.net/abc/path/other')
        assert edx['urlkey'] == 'net,example)/abc/path'
        assert edx['access'] == 'block'
        edx = access.find_access_rule('https://example.net/fo')
        assert edx['urlkey'] == ''
        assert edx['access'] == 'allow'
    def test_single_file_combined(self):
        agg = SimpleAggregator({'source': FileAccessIndexSource(TEST_EXCL_PATH + 'list1.aclj')})
        access = AccessChecker(agg, default_access='block')
        edx = access.find_access_rule('http://example.com/abc/page.html')
        assert edx['urlkey'] == 'com,example)/abc/page.html'
        assert edx['access'] == 'allow'
        edx = access.find_access_rule('http://example.com/abc/page.htm')
        assert edx['urlkey'] == 'com,example)/abc'
        assert edx['access'] == 'block'
        edx = access.find_access_rule('http://example.com/abc/')
        assert edx['urlkey'] == 'com,example)/abc'
        assert edx['access'] == 'block'
        edx = access.find_access_rule('http://foo.example.com/')
        assert edx['urlkey'] == 'com,example,'
        assert edx['access'] == 'exclude'
        edx = access.find_access_rule('http://example.com/')
        assert edx['urlkey'] == 'com,'
        assert edx['access'] == 'allow'
        edx = access.find_access_rule('foo.net')
        assert edx['urlkey'] == ''
        assert edx['access'] == 'block'
        edx = access.find_access_rule('https://example.net/abc/path/other')
        assert edx['urlkey'] == ''
        assert edx['access'] == 'block'
    def test_excludes_dir(self):
        agg = DirectoryAccessSource(TEST_EXCL_PATH)
        access = AccessChecker(agg, default_access='block')
        edx = access.find_access_rule('http://example.com/')
        assert edx['urlkey'] == 'com,example)/'
        assert edx['access'] == 'allow'
        edx = access.find_access_rule('http://example.bo')
        assert edx['urlkey'] == 'bo,example)/'
        assert edx['access'] == 'exclude'
        edx = access.find_access_rule('https://example.com/foo/path')
        assert edx['urlkey'] == 'com,example)/foo'
        assert edx['access'] == 'exclude'
        edx = access.find_access_rule('https://example.net/abc/path/other')
        assert edx['urlkey'] == 'net,example)/abc/path'
        assert edx['access'] == 'block'
--- a/pywb/warcserver/test/test_warcserver_config.yaml
+++ b/pywb/warcserver/test/test_warcserver_config.yaml
@ -1,3 +1,5 @@
 debug: true
 collections:
    # Live Index
--- a/pywb/warcserver/warcserver.py
+++ b/pywb/warcserver/warcserver.py
@ -14,6 +14,8 @@ from pywb.warcserver.index.indexsource import XmlQueryIndexSource
 from pywb.warcserver.index.zipnum import ZipNumIndexSource
 from pywb.warcserver.access_checker import AccessChecker, CacheDirectoryAccessSource
 from pywb import DEFAULT_CONFIG
 from six import iteritems, iterkeys, itervalues
@ -60,6 +62,9 @@ class WarcServer(BaseWarcServer):
        self.root_dir = self.config.get('collections_root', '')
        self.index_paths = self.init_paths('index_paths')
        self.archive_paths = self.init_paths('archive_paths', self.root_dir)
        self.acl_paths = self.init_paths('acl_paths')
        self.default_access = self.config.get('default_access')
        self.rules_file = self.config.get('rules_file', '')
@ -103,8 +108,12 @@ class WarcServer(BaseWarcServer):
                                               base_dir=self.index_paths,
                                               config=self.config)
        access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
                                       self.default_access)
        return DefaultResourceHandler(dir_source, self.archive_paths,
-                                      rules_file=self.rules_file)
+                                      rules_file=self.rules_file,
                                      access_checker=access_checker)
    def list_fixed_routes(self):
        return list(self.fixed_routes.keys())
@ -156,11 +165,15 @@ class WarcServer(BaseWarcServer):
        if isinstance(coll_config, str):
            index = coll_config
            archive_paths = None
            acl_paths = None
            default_access = self.default_access
        elif isinstance(coll_config, dict):
            index = coll_config.get('index')
            if not index:
                index = coll_config.get('index_paths')
            archive_paths = coll_config.get('archive_paths')
            acl_paths = coll_config.get('acl_paths')
            default_access = coll_config.get('default_access', self.default_access)
        else:
            raise Exception('collection config must be string or dict')
@ -186,8 +199,13 @@ class WarcServer(BaseWarcServer):
        if not archive_paths:
            archive_paths = self.config.get('archive_paths')
        access_checker = None
        if acl_paths:
            access_checker = AccessChecker(acl_paths, default_access)
        return DefaultResourceHandler(agg, archive_paths,
-                                      rules_file=self.rules_file)
+                                      rules_file=self.rules_file,
                                      access_checker=access_checker)
    def init_sequence(self, coll_name, seq_config):
        if not isinstance(seq_config, list):
--- a/sample_archive/access/allows.aclj
+++ b/sample_archive/access/allows.aclj
@ -0,0 +1,3 @@
 net,example)/test - {"access": "allow"}
 net,domain, - {"access": "allow"}
 net, - {"access": "allow"}
--- a/sample_archive/access/blocks.aclj
+++ b/sample_archive/access/blocks.aclj
@ -0,0 +1,3 @@
 net,example)/abc/path - {"access": "block"}
 com,example)/foo - {"access": "exclude"}
--- a/sample_archive/access/list1.aclj
+++ b/sample_archive/access/list1.aclj
@ -0,0 +1,9 @@
 com,example, - {"access": "exclude"}
 com,example)/abc/page.html - {"access": "allow"}
 com,example)/abc/ef - {"access": "block"}
 com,example)/abc/cd - {"access": "block"}
 com,example)/abc/ab - {"access": "block"}
 com,example)/abc - {"access": "block"}
 com,exampke)/ - {"access": "allow"}
 com,ex)/ - {"access": "exclude"}
 com, - {"access": "allow"}
--- a/sample_archive/access/list2.aclj
+++ b/sample_archive/access/list2.aclj
@ -0,0 +1,2 @@
 com,example)/ - {"access": "allow"}
 bo,example)/ - {"access": "exclude"}
--- a/sample_archive/access/pywb.aclj
+++ b/sample_archive/access/pywb.aclj
@ -0,0 +1,5 @@
 org,iana)/about - {"access": "block"}
 org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"}
 org,iana)/_css - {"access": "exclude"}
 org,example)/?example=1 - {"access": "block"}
 org,iana)/ - {"access": "exclude"}
--- a/tests/config_test_access.yaml
+++ b/tests/config_test_access.yaml
@ -0,0 +1,9 @@
 debug: true
 collections:
    pywb:
        index_paths: ./sample_archive/cdx/
        archive_paths: ./sample_archive/warcs/
        acl_paths: ./sample_archive/access/
        default_access: block
--- a/tests/test_acl.py
+++ b/tests/test_acl.py
@ -0,0 +1,55 @@
 from .base_config_test import BaseConfigTest, fmod
 import webtest
 import os
 from six.moves.urllib.parse import urlencode
 # ============================================================================
 class TestACLApp(BaseConfigTest):
    @classmethod
    def setup_class(cls):
        super(TestACLApp, cls).setup_class('config_test_access.yaml')
    def query(self, url, is_error=False, **params):
        params['url'] = url
        return self.testapp.get('/pywb/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
    def test_excluded_url(self):
        resp = self.query('http://www.iana.org/')
        assert len(resp.text.splitlines()) == 0
        self.testapp.get('/pywb/mp_/http://www.iana.org/', status=404)
    def test_blocked_url(self):
        resp = self.query('http://www.iana.org/about/')
        assert len(resp.text.splitlines()) == 1
        resp = self.testapp.get('/pywb/mp_/http://www.iana.org/about/', status=451)
        assert 'Access Blocked' in resp.text
    def test_allowed_more_specific(self):
        resp = self.query('http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf')
        assert resp.status_code == 200
        assert len(resp.text.splitlines()) > 0
        resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf', status=200)
        assert resp.content_type == 'application/octet-stream'
    def test_default_rule_blocked(self):
        resp = self.query('http://httpbin.org/anything/resource.json')
        assert len(resp.text.splitlines()) > 0
        resp = self.testapp.get('/pywb/mp_/http://httpbin.org/anything/resource.json', status=451)
        assert 'Access Blocked' in resp.text
		`@ -0,0 +1,3 @@`
							`net,example)/abc/path - {"access": "block"}`
							`com,example)/foo - {"access": "exclude"}`
		`@ -0,0 +1,2 @@`
							`com,example)/ - {"access": "allow"}`
							`bo,example)/ - {"access": "exclude"}`