diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index dcbb07cb..bcf2a4e0 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -78,6 +78,11 @@ class AccessChecker(object): EXACT_SUFFIX = '###' # type: str EXACT_SUFFIX_B = b'###' # type: bytes + # rules in the ACL file are followed by a space character (U+0020): + # for searching we need a match suffix which sorts/compares after it + # (or rather before it, because we use the rev_cmp function). Simply add + # another '#' (U+0023 > U+0020) + EXACT_SUFFIX_SEARCH_B = b'####' # type: bytes def __init__(self, access_source, default_access='allow'): """Initialize a new AccessChecker @@ -148,7 +153,7 @@ class AccessChecker(object): params = {'url': url, 'urlkey': urlkey, 'nosource': 'true', - 'exact_match_suffix': self.EXACT_SUFFIX_B + 'exact_match_suffix': self.EXACT_SUFFIX_SEARCH_B } if collection: params['param.coll'] = collection diff --git a/pywb/warcserver/test/test_access.py b/pywb/warcserver/test/test_access.py index 41a8a11d..d50d65ca 100644 --- a/pywb/warcserver/test/test_access.py +++ b/pywb/warcserver/test/test_access.py @@ -53,6 +53,10 @@ class TestAccess(TempDirTests, BaseTestClass): assert edx['urlkey'] == 'com,example)/foo' assert edx['access'] == 'exclude' + edx = access.find_access_rule('https://example.net/abc/path') + assert edx['urlkey'] == 'net,example)/abc/path' + assert edx['access'] == 'block' + edx = access.find_access_rule('https://example.net/abc/path/other') assert edx['urlkey'] == 'net,example)/abc/path' assert edx['access'] == 'block' @@ -114,7 +118,7 @@ class TestAccess(TempDirTests, BaseTestClass): assert edx['urlkey'] == 'net,example)/abc/path' assert edx['access'] == 'block' - # exact-only matchc + # exact-only match edx = access.find_access_rule('https://www.iana.org/') assert edx['urlkey'] == 'org,iana)/###' assert edx['access'] == 'allow' @@ -127,4 +131,12 @@ class TestAccess(TempDirTests, BaseTestClass): assert edx['urlkey'] == 
'org,iana)/' assert edx['access'] == 'exclude' + # exact-only match, first line in *.aclj file + edx = access.find_access_rule('https://www.iana.org/exact/match/first/line/aclj/') + assert edx['urlkey'] == 'org,iana)/exact/match/first/line/aclj###' + assert edx['access'] == 'allow' + # exact-only match, single rule in *.aclj file + edx = access.find_access_rule('https://www.lonesome-rule.org/') + assert edx['urlkey'] == 'org,lonesome-rule)/###' + assert edx['access'] == 'allow' diff --git a/sample_archive/access/pywb.aclj b/sample_archive/access/pywb.aclj index c06ba189..84b7e417 100644 --- a/sample_archive/access/pywb.aclj +++ b/sample_archive/access/pywb.aclj @@ -1,3 +1,4 @@ +org,iana)/exact/match/first/line/aclj### - {"access": "allow", "url": "https://www.iana.org/exact/match/first/line/aclj/"} org,iana)/about - {"access": "block"} org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"} org,iana)/_css - {"access": "exclude"} diff --git a/sample_archive/access/single-line.aclj b/sample_archive/access/single-line.aclj new file mode 100644 index 00000000..bbf2f053 --- /dev/null +++ b/sample_archive/access/single-line.aclj @@ -0,0 +1 @@ +org,lonesome-rule)/### - {"access": "allow", "url": "https://www.lonesome-rule.org/"}