1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

- support for allow/block/exclude access controls (as per ukwa/ukwa-pywb#7)

- .aclj files contain access controls in reverse sorted, CDXJ-like format
- ./sample_archive/acl contains sample acl files
- directory and single-file acl sources (extend directory aggregator and file index source)
- tests for longest-prefix acl match
- tests for acl applied to collection
- pywb.utils.merge -- merge(..., reverse=True) support for py2.7 (backported from py3.5)
- acl types:
  * allow - all allowed
  * block - allowed in index (as blocked) but content not allowed, served as 451
  * exclude - removed from index and content, served as 404
- warcserver: AccessChecker is initialized if 'acl_paths' is specified in custom collections
- exceptions:
  * clean up wbexception, subclasses provide the status code, message loaded automatically
  * warcserver handles AccessException with json response (now with 451 status)
  * pass status to template to allow custom handling
This commit is contained in:
Ilya Kreymer 2018-02-17 21:59:30 -08:00 committed by John Berlin
parent 5b7ca18e0f
commit 77eefcdce6
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
23 changed files with 562 additions and 59 deletions

View File

@ -17,13 +17,14 @@ from pywb.recorder.recorderapp import RecorderApp
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer from pywb.utils.geventserver import GeventServer
from pywb.utils.io import StreamIter from pywb.utils.io import StreamIter
from pywb.utils.wbexception import NotFoundException, WbException
from pywb.warcserver.warcserver import WarcServer from pywb.warcserver.warcserver import WarcServer
from pywb.rewrite.templateview import BaseInsertView from pywb.rewrite.templateview import BaseInsertView
from pywb.apps.static_handler import StaticHandler from pywb.apps.static_handler import StaticHandler
from pywb.apps.rewriterapp import RewriterApp, UpstreamException from pywb.apps.rewriterapp import RewriterApp
from pywb.apps.wbrequestresponse import WbResponse from pywb.apps.wbrequestresponse import WbResponse
import os import os
@ -382,8 +383,8 @@ class FrontEndApp(object):
wb_url_str = wb_url_str.replace('timemap/{0}/'.format(timemap_output), '') wb_url_str = wb_url_str.replace('timemap/{0}/'.format(timemap_output), '')
try: try:
response = self.rewriterapp.render_content(wb_url_str, metadata, environ) response = self.rewriterapp.render_content(wb_url_str, metadata, environ)
except UpstreamException as ue: except WbException as wbe:
response = self.rewriterapp.handle_error(environ, ue) response = self.rewriterapp.handle_error(environ, wbe)
raise HTTPException(response=response) raise HTTPException(response=response)
return response return response
@ -446,7 +447,7 @@ class FrontEndApp(object):
:param dict environ: The WSGI environment dictionary for the request :param dict environ: The WSGI environment dictionary for the request
:param str msg: The error message :param str msg: The error message
""" """
raise NotFound(response=self.rewriterapp._error_response(environ, msg)) raise NotFound(response=self.rewriterapp._error_response(environ, NotFoundException(msg)))
def _check_refer_redirect(self, environ): def _check_refer_redirect(self, environ):
"""Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header """Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header
@ -513,7 +514,7 @@ class FrontEndApp(object):
if self.debug: if self.debug:
traceback.print_exc() traceback.print_exc()
response = self.rewriterapp._error_response(environ, 'Internal Error: ' + str(e), '500 Server Error') response = self.rewriterapp._error_response(environ, WbException('Internal Error: ' + str(e)))
return response(environ, start_response) return response(environ, start_response)
@classmethod @classmethod

View File

@ -27,7 +27,11 @@ from pywb.warcserver.index.cdxobject import CDXObject
class UpstreamException(WbException): class UpstreamException(WbException):
def __init__(self, status_code, url, details): def __init__(self, status_code, url, details):
super(UpstreamException, self).__init__(url=url, msg=details) super(UpstreamException, self).__init__(url=url, msg=details)
self.status_code = status_code self._status_code = status_code
@property
def status_code(self):
return self._status_code
# ============================================================================ # ============================================================================
@ -502,24 +506,24 @@ class RewriterApp(object):
top_url += wb_url.to_str(mod='') top_url += wb_url.to_str(mod='')
return top_url return top_url
def handle_error(self, environ, ue): def handle_error(self, environ, wbe):
if ue.status_code == 404: if wbe.status_code == 404:
return self._not_found_response(environ, ue.url) return self._not_found_response(environ, wbe.url)
else: else:
status = str(ue.status_code) + ' ' + HTTP_STATUS_CODES.get(ue.status_code, 'Unknown Error') return self._error_response(environ, wbe)
return self._error_response(environ, ue.url, ue.msg,
status=status)
def _not_found_response(self, environ, url): def _not_found_response(self, environ, url):
resp = self.not_found_view.render_to_string(environ, url=url) resp = self.not_found_view.render_to_string(environ, url=url)
return WbResponse.text_response(resp, status='404 Not Found', content_type='text/html') return WbResponse.text_response(resp, status='404 Not Found', content_type='text/html')
def _error_response(self, environ, msg='', details='', status='404 Not Found'): def _error_response(self, environ, wbe):
status = wbe.status()
resp = self.error_view.render_to_string(environ, resp = self.error_view.render_to_string(environ,
err_msg=msg, err_msg=wbe.url,
err_details=details) err_details=wbe.msg,
err_status=wbe.status_code)
return WbResponse.text_response(resp, status=status, content_type='text/html') return WbResponse.text_response(resp, status=status, content_type='text/html')

View File

@ -3,8 +3,11 @@ collections_root: collections
# Per-Collection Paths # Per-Collection Paths
archive_paths: archive archive_paths: archive
index_paths: indexes index_paths: indexes
acl_paths: acl
static_path: static static_path: static
default_access: allow
templates_dir: templates templates_dir: templates
# Template HTML # Template HTML

View File

@ -89,9 +89,13 @@ def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
matched = True matched = True
break break
# no matches, so return empty iterator # no matches, so pop last line, but return rest of prev lines, if any
if not matched: if not matched:
return iter([]) if not prev_size or len(prev_deque) <= 1:
return iter([])
prev_deque.popleft()
return iter(prev_deque)
return itertools.chain(prev_deque, iter_) return itertools.chain(prev_deque, iter_)

112
pywb/utils/merge.py Normal file
View File

@ -0,0 +1,112 @@
import sys
if sys.version_info >= (3, 5): #pragma: no cover
    # On Python 3.5+ the standard-library merge is re-exported unchanged.
    from heapq import merge
else: #pragma: no cover
    # ported from python 3.5 heapq merge with reverse=True support
    from heapq import heapify, heappop, heapreplace
    from heapq import _heapify_max, _siftup_max

    def _heappop_max(heap):
        """Maxheap version of a heappop."""
        lastelt = heap.pop()    # raises appropriate IndexError if heap is empty
        if heap:
            # Move the last element to the root, then restore the max-heap
            # ordering below it.
            returnitem = heap[0]
            heap[0] = lastelt
            _siftup_max(heap, 0)
            return returnitem
        return lastelt

    def _heapreplace_max(heap, item):
        """Maxheap version of a heappop followed by a heappush."""
        returnitem = heap[0]    # raises appropriate IndexError if heap is empty
        heap[0] = item
        _siftup_max(heap, 0)
        return returnitem

    def _get_next_iter(it):
        """Return the bound advance method of iterator *it*.

        Uses ``it.__next__`` when the iterator has it and falls back to
        ``it.next`` otherwise.
        """
        return it.__next__ if hasattr(it, '__next__') else it.next

    def merge(*iterables, **kwargs):
        '''Merge multiple sorted inputs into a single sorted output.

        Similar to sorted(itertools.chain(*iterables)) but returns a generator,
        does not pull the data into memory all at once, and assumes that each of
        the input streams is already sorted (smallest to largest).

        >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
        [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]

        If *key* is not None, applies a key function to each element to determine
        its sort order.

        >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
        ['dog', 'cat', 'fish', 'horse', 'kangaroo']

        If *reverse* is true, the max-heap helpers are used instead, so the
        largest head element is yielded first; the inputs are then expected
        to be sorted largest to smallest.

        Only the ``key`` and ``reverse`` keywords are read from *kwargs*;
        any other keyword argument is ignored.
        '''

        key = kwargs.get('key', None)
        reverse = kwargs.get('reverse', False)

        h = []
        h_append = h.append

        # Select min-heap or max-heap primitives. ``direction`` flips the sign
        # of the per-iterable index stored in each heap entry, so that on the
        # max-heap equal values still resolve toward the earlier iterable.
        if reverse:
            _heapify = _heapify_max
            _heappop = _heappop_max
            _heapreplace = _heapreplace_max
            direction = -1
        else:
            _heapify = heapify
            _heappop = heappop
            _heapreplace = heapreplace
            direction = 1

        if key is None:
            # Heap entries are [current value, signed iterable index, next-method].
            for order, it in enumerate(map(iter, iterables)):
                try:
                    next = _get_next_iter(it)
                    h_append([next(), order * direction, next])
                except StopIteration:
                    # Empty iterable: contributes nothing to the heap.
                    pass
            _heapify(h)
            while len(h) > 1:
                try:
                    while True:
                        value, order, next = s = h[0]
                        yield value
                        s[0] = next()           # raises StopIteration when exhausted
                        _heapreplace(h, s)      # restore heap condition
                except StopIteration:
                    _heappop(h)                 # remove empty iterator
            if h:
                # fast case when only a single iterator remains
                value, order, next = h[0]
                yield value
                # next.__self__ is the iterator the bound method belongs to;
                # drain it directly without going through the heap.
                for v in next.__self__:
                    yield v
            return

        # Keyed variant: heap entries are
        # [key(value), signed iterable index, value, next-method].
        for order, it in enumerate(map(iter, iterables)):
            try:
                next = _get_next_iter(it)
                value = next()
                h_append([key(value), order * direction, value, next])
            except StopIteration:
                pass
        _heapify(h)
        while len(h) > 1:
            try:
                while True:
                    key_value, order, value, next = s = h[0]
                    yield value
                    value = next()
                    s[0] = key(value)
                    s[2] = value
                    _heapreplace(h, s)
            except StopIteration:
                _heappop(h)
        if h:
            # Single remaining iterator: yield its current value, then drain it.
            key_value, order, value, next = h[0]
            yield value
            for v in next.__self__:
                yield v

View File

@ -23,6 +23,9 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
>>> print_binsearch_results('org,iana)/time-zones', iter_exact) >>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
>>> print_binsearch_results_range('org,iana)/time-zones', 'org,iana)/time-zones!', iter_range)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Exact search -- no matches # Exact search -- no matches
>>> print_binsearch_results('org,iaana)/', iter_exact) >>> print_binsearch_results('org,iaana)/', iter_exact)
>>> print_binsearch_results('org,ibna)/', iter_exact) >>> print_binsearch_results('org,ibna)/', iter_exact)
@ -74,6 +77,7 @@ org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200
#================================================================= #=================================================================
import os import os
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.merge import merge
from pywb import get_test_dir from pywb import get_test_dir
@ -91,6 +95,22 @@ def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
print(line.decode('utf-8')) print(line.decode('utf-8'))
def test_rev_merge():
    """Check that merge(..., reverse=True) mirrors a forward merge.

    Merging the two CDX files in forward order and reversing the result
    must equal merging the reversed line lists with reverse=True.
    """
    def _read_lines(cdx_name):
        # Read every line of a sample CDX file as bytes.
        with open(test_cdx_dir + cdx_name, 'rb') as fh:
            return fh.readlines()

    iana_lines = _read_lines('iana.cdx')
    dupes_lines = _read_lines('dupes.cdx')

    # forward merge, then reverse the merged output
    forward_then_reversed = list(reversed(list(merge(iana_lines, dupes_lines))))

    # reverse each input first, then merge in reverse mode
    reverse_merged = list(merge(reversed(iana_lines),
                                reversed(dupes_lines),
                                reverse=True))

    assert forward_then_reversed == reverse_merged
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -1,3 +1,4 @@
from werkzeug.http import HTTP_STATUS_CODES
#================================================================= #=================================================================
@ -7,6 +8,13 @@ class WbException(Exception):
self.msg = msg self.msg = msg
self.url = url self.url = url
@property
def status_code(self):
return 500
def status(self):
return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error')
def __repr__(self): def __repr__(self):
return "{0}('{1}',)".format(self.__class__.__name__, self.msg) return "{0}('{1}',)".format(self.__class__.__name__, self.msg)
@ -17,25 +25,28 @@ class WbException(Exception):
#================================================================= #=================================================================
class AccessException(WbException): class AccessException(WbException):
def status(self): @property
return '403 Access Denied' def status_code(self):
return 451
#================================================================= #=================================================================
class BadRequestException(WbException): class BadRequestException(WbException):
def status(self): @property
return '400 Bad Request' def status_code(self):
return 400
#================================================================= #=================================================================
class NotFoundException(WbException): class NotFoundException(WbException):
def status(self): @property
return '404 Not Found' def status_code(self):
return 404
#================================================================= #=================================================================
class LiveResourceException(WbException): class LiveResourceException(WbException):
def status(self): @property
return '400 Bad Live Resource' def status_code(self):
return 400

View File

@ -0,0 +1,91 @@
from pywb.warcserver.index.indexsource import FileIndexSource
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
from pywb.utils.binsearch import search
from pywb.utils.merge import merge
import os
# ============================================================================
class FileAccessIndexSource(FileIndexSource):
    """File index source that searches its file with an inverted comparison.

    Overrides the line generator of FileIndexSource so that lookups use
    ``search`` with ``rev_cmp`` as the comparison function.
    """

    @staticmethod
    def rev_cmp(a, b):
        """Three-way compare with inverted sign.

        Returns 1 when ``a < b``, -1 when ``a > b`` and 0 otherwise.
        """
        is_less = a < b
        is_greater = a > b
        return is_less - is_greater

    def _get_gen(self, fh, params):
        """Return the line iterator for ``params['key']`` over open file *fh*.

        NOTE(review): prev_size=1 presumably makes ``search`` also return
        the line preceding the match position -- confirm against
        pywb.utils.binsearch.search.
        """
        lookup_key = params['key']
        return search(fh, lookup_key, prev_size=1, compare_func=self.rev_cmp)
# ============================================================================
class DirectoryAccessSource(DirectoryIndexSource):
    """Directory index source that picks up ``.aclj`` files.

    Each matching file is handled by FileAccessIndexSource, and the
    per-file iterators are combined with a reverse-order merge.
    """

    # (file extension, index source class) pairs recognised in a directory
    INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]

    def _merge(self, iter_list):
        """Merge the per-file iterators in *iter_list* using reverse=True."""
        merged_rules = merge(*iter_list, reverse=True)
        return merged_rules
# ============================================================================
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
    # Pure composition of CacheDirectoryMixin and DirectoryAccessSource;
    # this class adds no attributes or methods of its own.
    pass
# ============================================================================
class AccessChecker(object):
    """Find the access rule for a URL and apply rules to index results.

    A rule is a dict-like object with at least a ``urlkey`` and normally an
    ``access`` value.  The first rule yielded by the aggregator whose
    ``urlkey`` is a prefix of the lookup key wins; when nothing matches,
    ``default_rule`` is returned.
    """

    def __init__(self, access_source_file, default_access='allow'):
        """Set up the rule source and the fallback rule.

        :param access_source_file: either a ``str`` path to an access file
            or directory (an aggregator is created for it), or an
            already-built aggregator callable, which is used as-is
        :param default_access: access value of the fallback rule
        """
        if isinstance(access_source_file, str):
            self.aggregator = self.create_access_aggregator(access_source_file)
        else:
            self.aggregator = access_source_file

        # Rule returned when no stored rule matches; empty urlkey.
        self.default_rule = {'urlkey': '', 'access': default_access}

    def create_access_aggregator(self, filename):
        """Build the rule source for *filename*.

        :returns: a CacheDirectoryAccessSource for a directory, or a
            FileAccessIndexSource for a regular file
        :raises Exception: if *filename* is neither a directory nor a file
        """
        if os.path.isdir(filename):
            return CacheDirectoryAccessSource(filename)
        elif os.path.isfile(filename):
            return FileAccessIndexSource(filename)
        else:
            raise Exception('Invalid Access Source: ' + filename)

    def find_access_rule(self, url, ts=None, urlkey=None):
        """Return the first rule whose ``urlkey`` prefixes the lookup key.

        :param url: url to look up
        :param ts: accepted for callers that pass a timestamp; not used
            by the lookup
        :param urlkey: optional precomputed url key passed to the aggregator
        :returns: the matching rule, or ``self.default_rule`` if none match
        """
        params = {'url': url, 'urlkey': urlkey}
        cdx_iter, errs = self.aggregator(params)
        if errs:
            print(errs)

        # The aggregator call is relied upon to store the bytes lookup key
        # in params['key']; it is not set anywhere in this method.
        key = params['key'].decode('utf-8')

        for cdx in cdx_iter:
            # entries without a urlkey cannot be prefix-matched
            if 'urlkey' not in cdx:
                continue

            if key.startswith(cdx['urlkey']):
                return cdx

        return self.default_rule

    def __call__(self, res):
        """Wrap a ``(cdx_iter, errs)`` pair so the iterator is access-filtered."""
        cdx_iter, errs = res
        return self.wrap_iter(cdx_iter), errs

    def wrap_iter(self, cdx_iter):
        """Yield entries of *cdx_iter* annotated with their access value.

        Entries whose rule is ``exclude`` (or whose rule has no ``access``
        value) are dropped; all others get ``cdx['access']`` set.
        """
        for cdx in cdx_iter:
            url = cdx.get('url')
            # if no url, possible idx or other object, don't apply any checks and pass through
            if not url:
                yield cdx
                continue

            rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey'))
            access = rule.get('access', 'exclude')
            if access == 'exclude':
                continue

            cdx['access'] = access
            yield cdx

View File

@ -1,6 +1,8 @@
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from pywb.utils.format import query_to_dict from pywb.utils.format import query_to_dict
from pywb.utils.wbexception import AccessException
from werkzeug.routing import Map, Rule from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException from werkzeug.exceptions import HTTPException
@ -90,6 +92,12 @@ class BaseWarcServer(object):
start_response('200 OK', list(out_headers.items())) start_response('200 OK', list(out_headers.items()))
return res return res
except AccessException as ae:
out_headers = {}
res = self.json_encode(ae.msg, out_headers)
start_response(ae.status(), list(out_headers.items()))
return res
except Exception as e: except Exception as e:
if self.debug: if self.debug:
traceback.print_exc() traceback.print_exc()
@ -107,6 +115,7 @@ class BaseWarcServer(object):
def send_error(self, errs, start_response, def send_error(self, errs, start_response,
message='No Resource Found', status=404): message='No Resource Found', status=404):
last_exc = errs.pop('last_exc', None) last_exc = errs.pop('last_exc', None)
if last_exc: if last_exc:
if self.debug: if self.debug:

View File

@ -1,4 +1,4 @@
from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import BadRequestException, WbException, AccessException
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.utils.memento import MementoUtils from pywb.utils.memento import MementoUtils
@ -48,6 +48,7 @@ class IndexHandler(object):
self.index_source = index_source self.index_source = index_source
self.opts = opts or {} self.opts = opts or {}
self.fuzzy = FuzzyMatcher(kwargs.get('rules_file')) self.fuzzy = FuzzyMatcher(kwargs.get('rules_file'))
self.access_checker = kwargs.get('access_checker')
def get_supported_modes(self): def get_supported_modes(self):
return dict(modes=['list_sources', 'index']) return dict(modes=['list_sources', 'index'])
@ -62,7 +63,12 @@ class IndexHandler(object):
if input_req: if input_req:
params['alt_url'] = input_req.include_method_query(url) params['alt_url'] = input_req.include_method_query(url)
return self.fuzzy(self.index_source, params) cdx_iter = self.fuzzy(self.index_source, params)
if self.access_checker:
cdx_iter = self.access_checker(cdx_iter)
return cdx_iter
def __call__(self, params): def __call__(self, params):
mode = params.get('mode', 'index') mode = params.get('mode', 'index')
@ -101,8 +107,8 @@ class IndexHandler(object):
#============================================================================= #=============================================================================
class ResourceHandler(IndexHandler): class ResourceHandler(IndexHandler):
def __init__(self, index_source, resource_loaders, rules_file=None): def __init__(self, index_source, resource_loaders, **kwargs):
super(ResourceHandler, self).__init__(index_source, rules_file=rules_file) super(ResourceHandler, self).__init__(index_source, **kwargs)
self.resource_loaders = resource_loaders self.resource_loaders = resource_loaders
def get_supported_modes(self): def get_supported_modes(self):
@ -121,6 +127,11 @@ class ResourceHandler(IndexHandler):
last_exc = None last_exc = None
for cdx in cdx_iter: for cdx in cdx_iter:
if cdx.get('access', 'allow') != 'allow':
raise AccessException(msg={'access': cdx['access'],
'access_status': cdx.get('access_status', 451)},
url=cdx['url'])
for loader in self.resource_loaders: for loader in self.resource_loaders:
try: try:
out_headers, resp = loader(cdx, params) out_headers, resp = loader(cdx, params)
@ -141,13 +152,12 @@ class ResourceHandler(IndexHandler):
#============================================================================= #=============================================================================
class DefaultResourceHandler(ResourceHandler): class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths='', forward_proxy_prefix='', def __init__(self, index_source, warc_paths='', forward_proxy_prefix='',
rules_file=''): **kwargs):
loaders = [WARCPathLoader(warc_paths, index_source), loaders = [WARCPathLoader(warc_paths, index_source),
LiveWebLoader(forward_proxy_prefix), LiveWebLoader(forward_proxy_prefix),
VideoLoader() VideoLoader()
] ]
super(DefaultResourceHandler, self).__init__(index_source, loaders, super(DefaultResourceHandler, self).__init__(index_source, loaders, **kwargs)
rules_file=rules_file)
#============================================================================= #=============================================================================

View File

@ -90,10 +90,13 @@ class BaseAggregator(object):
if len(iter_list) <= 1: if len(iter_list) <= 1:
cdx_iter = iter_list[0] if iter_list else iter([]) cdx_iter = iter_list[0] if iter_list else iter([])
else: else:
cdx_iter = merge(*(iter_list)) cdx_iter = self._merge(iter_list)
return cdx_iter, err_list return cdx_iter, err_list
def _merge(self, iter_list):
return merge(*(iter_list))
def _on_source_error(self, name): #pragma: no cover def _on_source_error(self, name): #pragma: no cover
pass pass
@ -257,6 +260,11 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato
#============================================================================= #=============================================================================
class BaseDirectoryIndexSource(BaseAggregator): class BaseDirectoryIndexSource(BaseAggregator):
INDEX_SOURCES = [
(FileIndexSource.CDX_EXT, FileIndexSource),
(ZipNumIndexSource.IDX_EXT, ZipNumIndexSource)
]
def __init__(self, base_prefix, base_dir='', name='', config=None): def __init__(self, base_prefix, base_dir='', name='', config=None):
self.base_prefix = base_prefix self.base_prefix = base_prefix
self.base_dir = base_dir self.base_dir = base_dir
@ -280,13 +288,13 @@ class BaseDirectoryIndexSource(BaseAggregator):
def _load_files_single_dir(self, the_dir): def _load_files_single_dir(self, the_dir):
for name in os.listdir(the_dir): for name in os.listdir(the_dir):
filename = os.path.join(the_dir, name) for ext, cls in self.INDEX_SOURCES:
if not name.endswith(ext):
continue
is_cdx = filename.endswith(FileIndexSource.CDX_EXT) filename = os.path.join(the_dir, name)
is_zip = filename.endswith(ZipNumIndexSource.IDX_EXT)
if is_cdx or is_zip: #print('Adding ' + filename)
#print('Adding ' + filename)
rel_path = os.path.relpath(the_dir, self.base_prefix) rel_path = os.path.relpath(the_dir, self.base_prefix)
if rel_path == '.': if rel_path == '.':
full_name = name full_name = name
@ -296,10 +304,7 @@ class BaseDirectoryIndexSource(BaseAggregator):
if self.name: if self.name:
full_name = self.name + ':' + full_name full_name = self.name + ':' + full_name
if is_cdx: index_src = cls(filename, self.config)
index_src = FileIndexSource(filename)
else:
index_src = ZipNumIndexSource(filename, self.config)
yield full_name, index_src yield full_name, index_src
@ -341,9 +346,9 @@ class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
#============================================================================= #=============================================================================
class CacheDirectoryIndexSource(DirectoryIndexSource): class CacheDirectoryMixin(object):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(CacheDirectoryIndexSource, self).__init__(*args, **kwargs) super(CacheDirectoryMixin, self).__init__(*args, **kwargs)
self.cached_file_list = {} self.cached_file_list = {}
def _load_files_single_dir(self, the_dir): def _load_files_single_dir(self, the_dir):
@ -360,12 +365,17 @@ class CacheDirectoryIndexSource(DirectoryIndexSource):
print('Dir {0} unchanged'.format(the_dir)) print('Dir {0} unchanged'.format(the_dir))
return files return files
files = super(CacheDirectoryIndexSource, self)._load_files_single_dir(the_dir) files = super(CacheDirectoryMixin, self)._load_files_single_dir(the_dir)
files = list(files) files = list(files)
self.cached_file_list[the_dir] = (stat, files) self.cached_file_list[the_dir] = (stat, files)
return files return files
#=============================================================================
class CacheDirectoryIndexSource(CacheDirectoryMixin, DirectoryIndexSource):
pass
#============================================================================= #=============================================================================
class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource): class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource):
def _iter_sources(self, params): def _iter_sources(self, params):

View File

@ -36,8 +36,9 @@ ORIG_FILENAME = 'orig.filename'
#================================================================= #=================================================================
class CDXException(WbException): class CDXException(WbException):
def status(self): @property
return '400 Bad Request' def status_code(self):
return 400
#================================================================= #=================================================================
@ -132,7 +133,7 @@ class CDXObject(OrderedDict):
v = quote(v.encode('utf-8'), safe=':/') v = quote(v.encode('utf-8'), safe=':/')
if n != 'filename': if n != 'filename':
v = to_native_str(v, 'utf-8') v = to_native_str(v, 'utf-8') or v
self[n] = v self[n] = v

View File

@ -67,21 +67,26 @@ class BaseIndexSource(object):
class FileIndexSource(BaseIndexSource): class FileIndexSource(BaseIndexSource):
CDX_EXT = ('.cdx', '.cdxj') CDX_EXT = ('.cdx', '.cdxj')
def __init__(self, filename): def __init__(self, filename, config=None):
self.filename_template = filename self.filename_template = filename
def _do_open(self, filename):
try:
return open(filename, 'rb')
except IOError:
raise NotFoundException(filename)
def _get_gen(self, fh, params):
return iter_range(fh, params['key'], params['end_key'])
def load_index(self, params): def load_index(self, params):
filename = res_template(self.filename_template, params) filename = res_template(self.filename_template, params)
try: fh = self._do_open(filename)
fh = open(filename, 'rb')
except IOError:
raise NotFoundException(filename)
def do_load(fh): def do_load(fh):
with fh: with fh:
gen = iter_range(fh, params['key'], params['end_key']) for line in self._get_gen(fh, params):
for line in gen:
yield CDXObject(line) yield CDXObject(line)
return do_load(fh) return do_load(fh)

View File

@ -0,0 +1,116 @@
from mock import patch
import shutil
import os
from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.warcserver.access_checker import FileAccessIndexSource, AccessChecker, DirectoryAccessSource
from pywb.warcserver.test.testutils import to_path, TempDirTests, BaseTestClass
from pywb import get_test_dir
TEST_EXCL_PATH = to_path(get_test_dir() + '/access/')
# ============================================================================
class TestAccess(TempDirTests, BaseTestClass):
    """Longest-prefix access rule lookups against the sample .aclj files."""

    @staticmethod
    def _file_checker(acl_name, **checker_kwargs):
        # AccessChecker over a single sample acl file wrapped in a SimpleAggregator.
        source = FileAccessIndexSource(TEST_EXCL_PATH + acl_name)
        return AccessChecker(SimpleAggregator({'source': source}), **checker_kwargs)

    @staticmethod
    def _verify(checker, expectations):
        # Each expectation is (url, expected urlkey, expected access);
        # an expected access of None means only the urlkey is checked.
        for url, expected_urlkey, expected_access in expectations:
            edx = checker.find_access_rule(url)
            assert edx['urlkey'] == expected_urlkey
            if expected_access is not None:
                assert edx['access'] == expected_access

    def test_allows_only_default_block(self):
        """Allow-only rules with a default of 'block'."""
        access = self._file_checker('allows.aclj', default_access='block')

        self._verify(access, [
            ('http://example.net', 'net,', None),
            ('http://foo.example.net/abc', 'net,', None),
            ('https://example.net/test/', 'net,example)/test', None),
            ('https://example.org/', '', 'block'),
            ('https://abc.domain.net/path', 'net,domain,', None),
            ('https://domain.neta/path', '', 'block'),
        ])

    def test_blocks_only(self):
        """Block/exclude rules with the checker's default access."""
        access = self._file_checker('blocks.aclj')

        self._verify(access, [
            ('https://example.com/foo', 'com,example)/foo', 'exclude'),
            ('https://example.com/food', 'com,example)/foo', 'exclude'),
            ('https://example.com/foo/path', 'com,example)/foo', 'exclude'),
            ('https://example.net/abc/path/other', 'net,example)/abc/path', 'block'),
            ('https://example.net/fo', '', 'allow'),
        ])

    def test_single_file_combined(self):
        """Mixed allow/block/exclude rules from one file, default 'block'."""
        access = self._file_checker('list1.aclj', default_access='block')

        self._verify(access, [
            ('http://example.com/abc/page.html', 'com,example)/abc/page.html', 'allow'),
            ('http://example.com/abc/page.htm', 'com,example)/abc', 'block'),
            ('http://example.com/abc/', 'com,example)/abc', 'block'),
            ('http://foo.example.com/', 'com,example,', 'exclude'),
            ('http://example.com/', 'com,', 'allow'),
            ('foo.net', '', 'block'),
            ('https://example.net/abc/path/other', '', 'block'),
        ])

    def test_excludes_dir(self):
        """Rules merged from every acl file in the sample directory."""
        agg = DirectoryAccessSource(TEST_EXCL_PATH)
        access = AccessChecker(agg, default_access='block')

        self._verify(access, [
            ('http://example.com/', 'com,example)/', 'allow'),
            ('http://example.bo', 'bo,example)/', 'exclude'),
            ('https://example.com/foo/path', 'com,example)/foo', 'exclude'),
            ('https://example.net/abc/path/other', 'net,example)/abc/path', 'block'),
        ])

View File

@ -1,3 +1,5 @@
debug: true
collections: collections:
# Live Index # Live Index

View File

@ -14,6 +14,8 @@ from pywb.warcserver.index.indexsource import XmlQueryIndexSource
from pywb.warcserver.index.zipnum import ZipNumIndexSource from pywb.warcserver.index.zipnum import ZipNumIndexSource
from pywb.warcserver.access_checker import AccessChecker, CacheDirectoryAccessSource
from pywb import DEFAULT_CONFIG from pywb import DEFAULT_CONFIG
from six import iteritems, iterkeys, itervalues from six import iteritems, iterkeys, itervalues
@ -60,6 +62,9 @@ class WarcServer(BaseWarcServer):
self.root_dir = self.config.get('collections_root', '') self.root_dir = self.config.get('collections_root', '')
self.index_paths = self.init_paths('index_paths') self.index_paths = self.init_paths('index_paths')
self.archive_paths = self.init_paths('archive_paths', self.root_dir) self.archive_paths = self.init_paths('archive_paths', self.root_dir)
self.acl_paths = self.init_paths('acl_paths')
self.default_access = self.config.get('default_access')
self.rules_file = self.config.get('rules_file', '') self.rules_file = self.config.get('rules_file', '')
@ -103,8 +108,12 @@ class WarcServer(BaseWarcServer):
base_dir=self.index_paths, base_dir=self.index_paths,
config=self.config) config=self.config)
access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
self.default_access)
return DefaultResourceHandler(dir_source, self.archive_paths, return DefaultResourceHandler(dir_source, self.archive_paths,
rules_file=self.rules_file) rules_file=self.rules_file,
access_checker=access_checker)
def list_fixed_routes(self): def list_fixed_routes(self):
return list(self.fixed_routes.keys()) return list(self.fixed_routes.keys())
@ -156,11 +165,15 @@ class WarcServer(BaseWarcServer):
if isinstance(coll_config, str): if isinstance(coll_config, str):
index = coll_config index = coll_config
archive_paths = None archive_paths = None
acl_paths = None
default_access = self.default_access
elif isinstance(coll_config, dict): elif isinstance(coll_config, dict):
index = coll_config.get('index') index = coll_config.get('index')
if not index: if not index:
index = coll_config.get('index_paths') index = coll_config.get('index_paths')
archive_paths = coll_config.get('archive_paths') archive_paths = coll_config.get('archive_paths')
acl_paths = coll_config.get('acl_paths')
default_access = coll_config.get('default_access', self.default_access)
else: else:
raise Exception('collection config must be string or dict') raise Exception('collection config must be string or dict')
@ -186,8 +199,13 @@ class WarcServer(BaseWarcServer):
if not archive_paths: if not archive_paths:
archive_paths = self.config.get('archive_paths') archive_paths = self.config.get('archive_paths')
access_checker = None
if acl_paths:
access_checker = AccessChecker(acl_paths, default_access)
return DefaultResourceHandler(agg, archive_paths, return DefaultResourceHandler(agg, archive_paths,
rules_file=self.rules_file) rules_file=self.rules_file,
access_checker=access_checker)
def init_sequence(self, coll_name, seq_config): def init_sequence(self, coll_name, seq_config):
if not isinstance(seq_config, list): if not isinstance(seq_config, list):

View File

@ -0,0 +1,3 @@
net,example)/test - {"access": "allow"}
net,domain, - {"access": "allow"}
net, - {"access": "allow"}

View File

@ -0,0 +1,3 @@
net,example)/abc/path - {"access": "block"}
com,example)/foo - {"access": "exclude"}

View File

@ -0,0 +1,9 @@
com,example, - {"access": "exclude"}
com,example)/abc/page.html - {"access": "allow"}
com,example)/abc/ef - {"access": "block"}
com,example)/abc/cd - {"access": "block"}
com,example)/abc/ab - {"access": "block"}
com,example)/abc - {"access": "block"}
com,exampke)/ - {"access": "allow"}
com,ex)/ - {"access": "exclude"}
com, - {"access": "allow"}

View File

@ -0,0 +1,2 @@
com,example)/ - {"access": "allow"}
bo,example)/ - {"access": "exclude"}

View File

@ -0,0 +1,5 @@
org,iana)/about - {"access": "block"}
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"}
org,iana)/_css - {"access": "exclude"}
org,example)/?example=1 - {"access": "block"}
org,iana)/ - {"access": "exclude"}

View File

@ -0,0 +1,9 @@
# Test configuration defining a single 'pywb' collection with access controls.
debug: true
collections:
pywb:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
# Path to the access control (.aclj) rules used for this collection.
acl_paths: ./sample_archive/access/
# Passed to the access checker as its default access value; presumably used
# when no rule matches a URL -- confirm against AccessChecker.
default_access: block

55
tests/test_acl.py Normal file
View File

@ -0,0 +1,55 @@
from .base_config_test import BaseConfigTest, fmod
import webtest
import os
from six.moves.urllib.parse import urlencode
# ============================================================================
class TestACLApp(BaseConfigTest):
    """Access-control tests run against the app configured by 'config_test_access.yaml'."""

    @classmethod
    def setup_class(cls):
        """Initialise the shared test app from the access-control test config."""
        super(TestACLApp, cls).setup_class('config_test_access.yaml')

    def query(self, url, is_error=False, **params):
        """GET '/pywb/cdx' for ``url`` and return the response.

        :param url: value sent as the ``url`` query parameter
        :param is_error: forwarded to the test app as ``expect_errors``
        :param params: any additional query parameters
        """
        params['url'] = url
        query_string = urlencode(params, doseq=1)
        return self.testapp.get('/pywb/cdx?' + query_string, expect_errors=is_error)

    def test_excluded_url(self):
        """The index query yields no lines and the content request is a 404."""
        cdx_resp = self.query('http://www.iana.org/')
        index_lines = cdx_resp.text.splitlines()
        assert len(index_lines) == 0

        self.testapp.get('/pywb/mp_/http://www.iana.org/', status=404)

    def test_blocked_url(self):
        """The index query yields one line, but the content request is a 451."""
        cdx_resp = self.query('http://www.iana.org/about/')
        index_lines = cdx_resp.text.splitlines()
        assert len(index_lines) == 1

        content_resp = self.testapp.get('/pywb/mp_/http://www.iana.org/about/', status=451)
        assert 'Access Blocked' in content_resp.text

    def test_allowed_more_specific(self):
        """The font URL is present in the index and its content is served with a 200."""
        cdx_resp = self.query('http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf')
        assert cdx_resp.status_code == 200
        index_lines = cdx_resp.text.splitlines()
        assert len(index_lines) > 0

        content_resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf', status=200)
        assert content_resp.content_type == 'application/octet-stream'

    def test_default_rule_blocked(self):
        """The httpbin.org URL is listed in the index, but its content request is a 451."""
        cdx_resp = self.query('http://httpbin.org/anything/resource.json')
        index_lines = cdx_resp.text.splitlines()
        assert len(index_lines) > 0

        content_resp = self.testapp.get('/pywb/mp_/http://httpbin.org/anything/resource.json', status=451)
        assert 'Access Blocked' in content_resp.text