binsearch: add range based matching via iter_range()

support for: exact, prefix, host, domain match types
2025-03-24 06:59:52 +01:00 · 2014-02-20 21:21:12 -08:00 · 2014-02-20 21:21:12 -08:00 · a56cbcf62e
commit a56cbcf62e
parent 922917a631
5 changed files with 171 additions and 28 deletions
--- a/pywb/cdx/canonicalize.py
+++ b/pywb/cdx/canonicalize.py
@ -2,6 +2,7 @@
 """
 import surt
 import urlparse
 from cdxobject import CDXException
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
        return surt
 #=================================================================
 def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
     """
     Canonicalize a url (either with a custom canonicalizer or the
     standard canonicalizer, with or without surt ordering), then
     compute the (start_key, end_key) search range for a given match type.

     Supported match types:
     * exact
     * prefix
     * host
     * domain (only available with surt ordering)

     :param url: the url to canonicalize and build the range from
     :param match_type: one of 'exact', 'prefix', 'host', 'domain'
     :param surt_ordered: whether keys are surt-ordered; only used when
                          no url_canon is passed in
     :param url_canon: optional canonicalizer callable; when given, its
                       ``surt_ordered`` attribute overrides the
                       ``surt_ordered`` argument
     :return: tuple of (start_key, end_key)
     :raises Exception: if match_type is not one of the supported types,
                        or if 'domain' is requested for non-surt ordering

     Examples below:

     # surt ranges

     >>> calc_search_range('http://example.com/path/file.html', 'exact')
     ('com,example)/path/file.html', 'com,example)/path/file.html!')

     >>> calc_search_range('http://example.com/path/file.html', 'prefix')
     ('com,example)/path/file.html', 'com,example)/path/file.htmm')

     >>> calc_search_range('http://example.com/path/file.html', 'host')
     ('com,example)/', 'com,example*')

     >>> calc_search_range('http://example.com/path/file.html', 'domain')
     ('com,example)/', 'com,example-')

     special case for tld domain range

     >>> calc_search_range('com', 'domain')
     ('com,', 'com-')

     # non-surt ranges

     >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
     ('example.com/path/file.html', 'example.com/path/file.html!')

     >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
     ('example.com/path/file.html', 'example.com/path/file.htmm')

     >>> calc_search_range('http://example.com/path/file.html', 'host', False)
     ('example.com/', 'example.com0')

     # domain range not supported

     >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
     Traceback (most recent call last):
     Exception: matchType=domain unsupported for non-surt
     """
     def inc_last_char(x):
         # smallest string that sorts after every string starting with x
         return x[0:-1] + chr(ord(x[-1]) + 1)

     if not url_canon:
         # make new canon
         url_canon = UrlCanonicalizer(surt_ordered)
     else:
         # ensure surt order matches url_canon
         surt_ordered = url_canon.surt_ordered

     start_key = url_canon(url)

     if match_type == 'exact':
         # '!' is the character right after ' ' in ASCII, so the range
         # covers the key followed by a space and anything after it
         end_key = start_key + '!'

     elif match_type == 'prefix':
         # add trailing slash if url has it
         if url.endswith('/') and not start_key.endswith('/'):
             start_key += '/'

         end_key = inc_last_char(start_key)

     elif match_type == 'host':
         if surt_ordered:
             host = start_key.split(')/')[0]

             start_key = host + ')/'
             # '*' is the character right after ')' in ASCII
             end_key = host + '*'
         else:
             # NOTE(review): the host comes from the raw url, not from the
             # canonicalized key; urlsplit() yields an empty netloc for a
             # url without '//' -- confirm callers pass absolute urls
             host = urlparse.urlsplit(url).netloc

             start_key = host + '/'
             # '0' is the character right after '/' in ASCII
             end_key = host + '0'

     elif match_type == 'domain':
         if not surt_ordered:
             raise Exception('matchType=domain unsupported for non-surt')

         host = start_key.split(')/')[0]

         # if tld, use com, as start_key
         # otherwise, stick with com,example)/
         if ',' not in host:
             start_key = host + ','
         else:
             start_key = host + ')/'

         # '-' is the character right after ',' in ASCII
         end_key = host + '-'

     else:
         # %-format instead of '+' so that a non-string match_type
         # (e.g. None) still produces this error rather than a TypeError
         raise Exception('Invalid match_type: %s' % (match_type,))

     return (start_key, end_key)
 # Run the doctests embedded in this module when it is executed directly.
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -1,4 +1,4 @@
-from canonicalize import UrlCanonicalizer
+from canonicalize import UrlCanonicalizer, calc_search_range
 from cdxops import cdx_load
 from cdxsource import CDXSource, CDXFile, RemoteCDXSource
@ -14,8 +14,23 @@ import urlparse
 #=================================================================
 class BaseCDXServer(object):
    def __init__(self, **kwargs):
-        self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
+        ds_rules = kwargs.get('ds_rules')
-        self.fuzzy_query = kwargs.get('fuzzy_query')
+        surt_ordered = kwargs.get('surt_ordered', True)
        # load from domain-specific rules
        if ds_rules:
            self.url_canon, self.fuzzy_query = (
                load_domain_specific_cdx_rules(ds_rules, surt_ordered))
        # or custom passed in canonicalizer
        else:
            self.url_canon = kwargs.get('url_canon')
            self.fuzzy_query = kwargs.get('fuzzy_query')
        # set default canonicalizer if none set thus far
        if not self.url_canon:
            self.url_canon = UrlCanonicalizer(surt_ordered)
        # set perms checker, if any
        self.perms_checker = kwargs.get('perms_checker')
    def _check_cdx_iter(self, cdx_iter, params):
@ -77,7 +92,14 @@ class CDXServer(BaseCDXServer):
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)
-            params['key'] = self.url_canon(url)
+            #params['key'] = self.url_canon(url)
            match_type = params.get('matchType', 'exact')
            key, end_key = calc_search_range(url=url,
                                             match_type=match_type,
                                             url_canon=self.url_canon)
            params['key'] = key
            params['end_key'] = end_key
        cdx_iter = cdx_load(self.sources, params, self.perms_checker)
@ -131,15 +153,6 @@ def create_cdx_server(config, ds_rules_file=None):
    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
    if ds_rules_file:
        canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
                                                      surt_ordered)
    else:
        canon, fuzzy = None, None
    if not canon:
        canon = UrlCanonicalizer(surt_ordered)
    if (isinstance(paths, str) and
        any(paths.startswith(x) for x in ['http://', 'https://'])):
        server_cls = RemoteCDXServer
@ -147,8 +160,8 @@ def create_cdx_server(config, ds_rules_file=None):
        server_cls = CDXServer
    return server_cls(paths,
-                      url_canon=canon,
+                      surt_ordered=surt_ordered,
-                      fuzzy_query=fuzzy,
+                      ds_rules=ds_rules_file,
                      perms_checker=perms_checker)
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -1,4 +1,4 @@
-from pywb.utils.binsearch import iter_exact, iter_prefix
+from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader
 import urllib
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
    def load_cdx(self, params):
        source = SeekableTextFileReader(self.filename)
-
+        return iter_range(source, params.get('key'), params.get('end_key'))
-        match_type = params.get('matchType')
-        if match_type == 'prefix':
-            iter_func = iter_prefix
-        else:
-            iter_func = iter_exact
-        key = params.get('key')
-        return iter_func(source, key)
    def __str__(self):
        return 'CDX File - ' + self.filename
--- a/pywb/utils/binsearch.py
+++ b/pywb/utils/binsearch.py
@ -87,6 +87,26 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
    return gen_iter(line)
 #=================================================================
 def iter_range(reader, start, end):
     """
     Creates an iterator which iterates over lines where
     start <= line < end (end exclusive)

     :param reader: reader passed through unchanged to search()
     :param start: key at which iteration begins; locating the first
                   line for it is delegated to search()
     :param end: exclusive upper bound; iteration stops at the first
                 line that does not compare less than it
     :return: a lazy iterator (itertools.takewhile) over the lines
     """
     lines_from_start = search(reader, start)

     # strict '<' keeps the end bound exclusive, as documented above;
     # a line exactly equal to end is not part of the range
     return itertools.takewhile(lambda line: line < end,
                                lines_from_start)
 #=================================================================
 def iter_prefix(reader, key):
    """
--- a/pywb/utils/test/binsearch_test.py
+++ b/pywb/utils/test/binsearch_test.py
@ -25,12 +25,21 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 >>> print_binsearch_results('org,iana)/time-zones', iter_exact)
 org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
 # Range Search (end exclusive)
 >>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
 org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
 org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
 org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
 org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
 org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
 """
 #=================================================================
 import os
-from pywb.utils.binsearch import iter_prefix, iter_exact
+from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
 from pywb.utils.loaders import SeekableTextFileReader
 from pywb import get_test_dir
@ -45,6 +54,13 @@ def print_binsearch_results(key, iter_func):
        print line
 def print_binsearch_results_range(key, end_key, iter_func):
     """
     Print every line that iter_func yields for the range
     (key, end_key) over the iana.cdx file in the test cdx dir.
     """
     reader = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
     matches = iter_func(reader, key, end_key)
     for entry in matches:
         print(entry)
 # Run the doctests in this module's docstring when executed directly.
 if __name__ == "__main__":
    import doctest
    doctest.testmod()