binsearch: add range based matching via iter_range()

support for: exact, prefix, host, domain match types
2025-03-15 00:03:28 +01:00 · 2014-02-20 21:21:12 -08:00 · 2014-02-20 21:21:12 -08:00 · a56cbcf62e
commit a56cbcf62e
parent 922917a631
5 changed files with 171 additions and 28 deletions
--- a/pywb/cdx/canonicalize.py
+++ b/pywb/cdx/canonicalize.py
@ -2,6 +2,7 @@
 """

 import surt
+import urlparse
 from cdxobject import CDXException


@ -69,6 +70,109 @@ index.html?a=b?c=)/')
        return surt


+#=================================================================
+def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
+    """
+    Canonicalize a url (with url_canon if given, otherwise with a new
+    UrlCanonicalizer(surt_ordered)) and return the (start_key, end_key)
+    search range for match_type; url_canon.surt_ordered, when a
+    canonicalizer is passed in, takes precedence over surt_ordered.
+    Raises Exception on an unknown match_type or a non-surt 'domain'.
+
+    Supported match types:
+    * exact
+    * prefix
+    * host
+    * domain (only available with surt ordering)
+
+    Examples below:
+
+    # surt ranges
+    >>> calc_search_range('http://example.com/path/file.html', 'exact')
+    ('com,example)/path/file.html', 'com,example)/path/file.html!')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
+    ('com,example)/path/file.html', 'com,example)/path/file.htmm')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'host')
+    ('com,example)/', 'com,example*')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'domain')
+    ('com,example)/', 'com,example-')
+
+    # special case: a bare tld as the domain
+    >>> calc_search_range('com', 'domain')
+    ('com,', 'com-')
+
+    # non-surt ranges
+    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
+    ('example.com/path/file.html', 'example.com/path/file.html!')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
+    ('example.com/path/file.html', 'example.com/path/file.htmm')
+
+    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
+    ('example.com/', 'example.com0')
+
+    # domain range not supported
+    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
+    Traceback (most recent call last):
+    Exception: matchType=domain unsupported for non-surt
+    """
+    def inc_last_char(x):  # e.g. 'html' -> 'htmm'
+        return x[0:-1] + chr(ord(x[-1]) + 1)
+
+    if not url_canon:
+        # make new canon
+        url_canon = UrlCanonicalizer(surt_ordered)
+    else:
+        # ensure surt order matches url_canon
+        surt_ordered = url_canon.surt_ordered
+
+    start_key = url_canon(url)
+
+    if match_type == 'exact':
+        end_key = start_key + '!'  # '!' is the char right after ' '
+
+    elif match_type == 'prefix':
+        # restore the url's trailing slash if the canonical key lacks it
+        if url.endswith('/') and not start_key.endswith('/'):
+            start_key += '/'
+
+        end_key = inc_last_char(start_key)
+
+    elif match_type == 'host':
+        if surt_ordered:
+            host = start_key.split(')/')[0]
+
+            start_key = host + ')/'
+            end_key = host + '*'  # '*' is the char right after ')'
+        else:
+            host = urlparse.urlsplit(url).netloc
+
+            start_key = host + '/'
+            end_key = host + '0'  # '0' is the char right after '/'
+
+    elif match_type == 'domain':
+        if not surt_ordered:
+            raise Exception('matchType=domain unsupported for non-surt')
+
+        host = start_key.split(')/')[0]
+
+        # a bare tld has no ',' in it: start the range at e.g. 'com,'
+        # otherwise start at the host itself, e.g. 'com,example)/'
+        if not ',' in host:
+            start_key = host + ','
+        else:
+            start_key = host + ')/'
+
+        end_key = host + '-'  # '-' is the char right after ','
+    else:
+        raise Exception('Invalid match_type: ' + match_type)
+
+    return (start_key, end_key)
+
+
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -1,4 +1,4 @@
-from canonicalize import UrlCanonicalizer
+from canonicalize import UrlCanonicalizer, calc_search_range

 from cdxops import cdx_load
 from cdxsource import CDXSource, CDXFile, RemoteCDXSource
@ -14,8 +14,23 @@ import urlparse
 #=================================================================
 class BaseCDXServer(object):
    def __init__(self, **kwargs):
-        self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
-        self.fuzzy_query = kwargs.get('fuzzy_query')
+        ds_rules = kwargs.get('ds_rules')
+        surt_ordered = kwargs.get('surt_ordered', True)
+
+        # load from domain-specific rules
+        if ds_rules:
+            self.url_canon, self.fuzzy_query = (
+                load_domain_specific_cdx_rules(ds_rules, surt_ordered))
+        # or custom passed in canonicalizer
+        else:
+            self.url_canon = kwargs.get('url_canon')
+            self.fuzzy_query = kwargs.get('fuzzy_query')
+
+        # set default canonicalizer if none set thus far
+        if not self.url_canon:
+            self.url_canon = UrlCanonicalizer(surt_ordered)
+
+        # set perms checker, if any
        self.perms_checker = kwargs.get('perms_checker')

    def _check_cdx_iter(self, cdx_iter, params):
@ -77,7 +92,14 @@ class CDXServer(BaseCDXServer):
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)

-            params['key'] = self.url_canon(url)
+            #params['key'] = self.url_canon(url)
+            match_type = params.get('matchType', 'exact')
+
+            key, end_key = calc_search_range(url=url,
+                                             match_type=match_type,
+                                             url_canon=self.url_canon)
+            params['key'] = key
+            params['end_key'] = end_key

        cdx_iter = cdx_load(self.sources, params, self.perms_checker)

@ -131,15 +153,6 @@ def create_cdx_server(config, ds_rules_file=None):

    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

-    if ds_rules_file:
-        canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
-                                                      surt_ordered)
-    else:
-        canon, fuzzy = None, None
-
-    if not canon:
-        canon = UrlCanonicalizer(surt_ordered)
-
    if (isinstance(paths, str) and
        any(paths.startswith(x) for x in ['http://', 'https://'])):
        server_cls = RemoteCDXServer
@ -147,8 +160,8 @@ def create_cdx_server(config, ds_rules_file=None):
        server_cls = CDXServer

    return server_cls(paths,
-                      url_canon=canon,
-                      fuzzy_query=fuzzy,
+                      surt_ordered=surt_ordered,
+                      ds_rules=ds_rules_file,
                      perms_checker=perms_checker)


--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -1,4 +1,4 @@
-from pywb.utils.binsearch import iter_exact, iter_prefix
+from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader

 import urllib
@ -24,17 +24,7 @@ class CDXFile(CDXSource):

    def load_cdx(self, params):
        source = SeekableTextFileReader(self.filename)
-
-        match_type = params.get('matchType')
-
-        if match_type == 'prefix':
-            iter_func = iter_prefix
-        else:
-            iter_func = iter_exact
-
-        key = params.get('key')
-
-        return iter_func(source, key)
+        return iter_range(source, params.get('key'), params.get('end_key'))

    def __str__(self):
        return 'CDX File - ' + self.filename
--- a/pywb/utils/binsearch.py
+++ b/pywb/utils/binsearch.py
@ -87,6 +87,26 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
    return gen_iter(line)


+#=================================================================
+def iter_range(reader, start, end):
+    """
+    Creates an iterator which iterates over lines where
+    start <= line < end (end exclusive)
+
+    reader and start are passed to search(); every line it yields
+    is then compared against end with '<'.
+    """
+    # NOTE(review): no lower-bound filter is applied here, so this
+    # relies on search() never yielding a line < start -- confirm
+    iter_ = search(reader, start)
+
+    # strict '<' keeps a line equal to end out of the result,
+    # matching the end-exclusive contract stated above
+    end_iter = itertools.takewhile(lambda line: line < end, iter_)
+
+    return end_iter
+
+
 #=================================================================
 def iter_prefix(reader, key):
    """
--- a/pywb/utils/test/binsearch_test.py
+++ b/pywb/utils/test/binsearch_test.py
@ -25,12 +25,21 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex

 >>> print_binsearch_results('org,iana)/time-zones', iter_exact)
 org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
+
+
+# Range Search (end exclusive)
+>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
+org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
+org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
+org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
+org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
+org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
 """


 #=================================================================
 import os
-from pywb.utils.binsearch import iter_prefix, iter_exact
+from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
 from pywb.utils.loaders import SeekableTextFileReader

 from pywb import get_test_dir
@ -45,6 +54,13 @@ def print_binsearch_results(key, iter_func):
        print line


+def print_binsearch_results_range(key, end_key, iter_func):
+    """Print every line of iter_func(reader, key, end_key) on iana.cdx"""
+    reader = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
+    for cdx_line in iter_func(reader, key, end_key):
+        print cdx_line
+
+
 if __name__ == "__main__":
    import doctest
    doctest.testmod()