From a56cbcf62e39a330398dab9f0d8020e4fcef177f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 20 Feb 2014 21:21:12 -0800 Subject: [PATCH] binsearch: add range based matching via iter_range() support for: exact, prefix, host, domain match types --- pywb/cdx/canonicalize.py | 104 ++++++++++++++++++++++++++++++ pywb/cdx/cdxserver.py | 43 +++++++----- pywb/cdx/cdxsource.py | 14 +--- pywb/utils/binsearch.py | 20 ++++++ pywb/utils/test/binsearch_test.py | 18 +++++- 5 files changed, 171 insertions(+), 28 deletions(-) diff --git a/pywb/cdx/canonicalize.py b/pywb/cdx/canonicalize.py index e0adb5c1..e2f818b9 100644 --- a/pywb/cdx/canonicalize.py +++ b/pywb/cdx/canonicalize.py @@ -2,6 +2,7 @@ """ import surt +import urlparse from cdxobject import CDXException @@ -69,6 +70,109 @@ index.html?a=b?c=)/') return surt +#================================================================= +def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): + """ + Canonicalize a url (either with custom canonicalizer or + standard canonicalizer with or without surt) + + Then, compute the start and end keys of the search range + for a given match type. 
+ + Supported match types: + * exact + * prefix + * host + * domain (only available for surt ordering) + + Examples below: + + # surt ranges + >>> calc_search_range('http://example.com/path/file.html', 'exact') + ('com,example)/path/file.html', 'com,example)/path/file.html!') + + >>> calc_search_range('http://example.com/path/file.html', 'prefix') + ('com,example)/path/file.html', 'com,example)/path/file.htmm') + + >>> calc_search_range('http://example.com/path/file.html', 'host') + ('com,example)/', 'com,example*') + + >>> calc_search_range('http://example.com/path/file.html', 'domain') + ('com,example)/', 'com,example-') + + # special case for tld domain range + >>> calc_search_range('com', 'domain') + ('com,', 'com-') + + # non-surt ranges + >>> calc_search_range('http://example.com/path/file.html', 'exact', False) + ('example.com/path/file.html', 'example.com/path/file.html!') + + >>> calc_search_range('http://example.com/path/file.html', 'prefix', False) + ('example.com/path/file.html', 'example.com/path/file.htmm') + + >>> calc_search_range('http://example.com/path/file.html', 'host', False) + ('example.com/', 'example.com0') + + # domain range not supported + >>> calc_search_range('http://example.com/path/file.html', 'domain', False) + Traceback (most recent call last): + Exception: matchType=domain unsupported for non-surt + """ + def inc_last_char(x): + return x[0:-1] + chr(ord(x[-1]) + 1) + + if not url_canon: + # make new canon + url_canon = UrlCanonicalizer(surt_ordered) + else: + # ensure surt order matches url_canon + surt_ordered = url_canon.surt_ordered + + start_key = url_canon(url) + + if match_type == 'exact': + end_key = start_key + '!' 
+ + elif match_type == 'prefix': + # add trailing slash if url has it + if url.endswith('/') and not start_key.endswith('/'): + start_key += '/' + + end_key = inc_last_char(start_key) + + elif match_type == 'host': + if surt_ordered: + host = start_key.split(')/')[0] + + start_key = host + ')/' + end_key = host + '*' + else: + host = urlparse.urlsplit(url).netloc + + start_key = host + '/' + end_key = host + '0' + + elif match_type == 'domain': + if not surt_ordered: + raise Exception('matchType=domain unsupported for non-surt') + + host = start_key.split(')/')[0] + + # if tld, use com, as start_key + # otherwise, stick with com,example)/ + if not ',' in host: + start_key = host + ',' + else: + start_key = host + ')/' + + end_key = host + '-' + else: + raise Exception('Invalid match_type: ' + match_type) + + return (start_key, end_key) + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 69f19d21..41cc46cc 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,4 +1,4 @@ -from canonicalize import UrlCanonicalizer +from canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource @@ -14,8 +14,23 @@ import urlparse #================================================================= class BaseCDXServer(object): def __init__(self, **kwargs): - self.url_canon = kwargs.get('url_canon', UrlCanonicalizer()) - self.fuzzy_query = kwargs.get('fuzzy_query') + ds_rules = kwargs.get('ds_rules') + surt_ordered = kwargs.get('surt_ordered', True) + + # load from domain-specific rules + if ds_rules: + self.url_canon, self.fuzzy_query = ( + load_domain_specific_cdx_rules(ds_rules, surt_ordered)) + # or custom passed in canonicalizer + else: + self.url_canon = kwargs.get('url_canon') + self.fuzzy_query = kwargs.get('fuzzy_query') + + # set default canonicalizer if none set thus far + if not self.url_canon: + 
self.url_canon = UrlCanonicalizer(surt_ordered) + + # set perms checker, if any self.perms_checker = kwargs.get('perms_checker') def _check_cdx_iter(self, cdx_iter, params): @@ -77,7 +92,14 @@ class CDXServer(BaseCDXServer): msg = 'A url= param must be specified to query the cdx server' raise CDXException(msg) - params['key'] = self.url_canon(url) + #params['key'] = self.url_canon(url) + match_type = params.get('matchType', 'exact') + + key, end_key = calc_search_range(url=url, + match_type=match_type, + url_canon=self.url_canon) + params['key'] = key + params['end_key'] = end_key cdx_iter = cdx_load(self.sources, params, self.perms_checker) @@ -131,15 +153,6 @@ def create_cdx_server(config, ds_rules_file=None): logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) - if ds_rules_file: - canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file, - surt_ordered) - else: - canon, fuzzy = None, None - - if not canon: - canon = UrlCanonicalizer(surt_ordered) - if (isinstance(paths, str) and any(paths.startswith(x) for x in ['http://', 'https://'])): server_cls = RemoteCDXServer @@ -147,8 +160,8 @@ def create_cdx_server(config, ds_rules_file=None): server_cls = CDXServer return server_cls(paths, - url_canon=canon, - fuzzy_query=fuzzy, + surt_ordered=surt_ordered, + ds_rules=ds_rules_file, perms_checker=perms_checker) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index a8c92be5..39285cf8 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,4 +1,4 @@ -from pywb.utils.binsearch import iter_exact, iter_prefix +from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader import urllib @@ -24,17 +24,7 @@ class CDXFile(CDXSource): def load_cdx(self, params): source = SeekableTextFileReader(self.filename) - - match_type = params.get('matchType') - - if match_type == 'prefix': - iter_func = iter_prefix - else: - iter_func = iter_exact - - key = params.get('key') - - return iter_func(source, key) + return 
iter_range(source, params.get('key'), params.get('end_key')) def __str__(self): return 'CDX File - ' + self.filename diff --git a/pywb/utils/binsearch.py b/pywb/utils/binsearch.py index 96b2e9de..6e676a6d 100644 --- a/pywb/utils/binsearch.py +++ b/pywb/utils/binsearch.py @@ -87,6 +87,26 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192): return gen_iter(line) +#================================================================= +def iter_range(reader, start, end): + """ + Creates an iterator which iterates over lines where + start <= line < end (end exclusive) + """ + + iter_ = search(reader, start) + +# iter_ = itertools.dropwhile( +# lambda line: line < start, +# search(reader, start)) + + end_iter = itertools.takewhile( + lambda line: line < end, + iter_) + + return end_iter + + #================================================================= def iter_prefix(reader, key): """ diff --git a/pywb/utils/test/binsearch_test.py b/pywb/utils/test/binsearch_test.py index d35551ec..41a02897 100644 --- a/pywb/utils/test/binsearch_test.py +++ b/pywb/utils/test/binsearch_test.py @@ -25,12 +25,21 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex >>> print_binsearch_results('org,iana)/time-zones', iter_exact) org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz + + +# Range Search (end exclusive) +>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range) +org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz +org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz +org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 
HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz +org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz +org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz """ #================================================================= import os -from pywb.utils.binsearch import iter_prefix, iter_exact +from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range from pywb.utils.loaders import SeekableTextFileReader from pywb import get_test_dir @@ -45,6 +54,13 @@ def print_binsearch_results(key, iter_func): print line +def print_binsearch_results_range(key, end_key, iter_func): + cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') + + for line in iter_func(cdx, key, end_key): + print line + + if __name__ == "__main__": import doctest doctest.testmod()