1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

binsearch: add range-based matching via iter_range()

Adds support for the exact, prefix, host, and domain match types.
This commit is contained in:
Ilya Kreymer 2014-02-20 21:21:12 -08:00
parent 922917a631
commit a56cbcf62e
5 changed files with 171 additions and 28 deletions

View File

@ -2,6 +2,7 @@
"""
import surt
import urlparse
from cdxobject import CDXException
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
return surt
#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Canonicalize a url (either with a custom canonicalizer or with the
    standard canonicalizer, with or without surt ordering), then compute
    the start and end keys of the search range for a given match type.

    Supported match types:
    * exact
    * prefix
    * host
    * domain (only available with surt ordering)

    :param url: the url to canonicalize and build the range for
    :param match_type: one of 'exact', 'prefix', 'host', 'domain'
    :param surt_ordered: whether keys are surt ordered; only used when
        no url_canon is passed in
    :param url_canon: optional canonicalizer, called as url_canon(url);
        when given, its surt_ordered attribute overrides surt_ordered
    :return: a (start_key, end_key) tuple
    :raises Exception: for an unknown match_type, or for 'domain'
        when the keys are not surt ordered

    Examples below:

    # surt ranges

    >>> calc_search_range('http://example.com/path/file.html', 'exact')
    ('com,example)/path/file.html', 'com,example)/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
    ('com,example)/path/file.html', 'com,example)/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host')
    ('com,example)/', 'com,example*')

    >>> calc_search_range('http://example.com/path/file.html', 'domain')
    ('com,example)/', 'com,example-')

    special case for tld domain range

    >>> calc_search_range('com', 'domain')
    ('com,', 'com-')

    # non-surt ranges

    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
    ('example.com/path/file.html', 'example.com/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
    ('example.com/path/file.html', 'example.com/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

    # domain range not supported

    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
    Exception: matchType=domain unsupported for non-surt
    """
    def inc_last_char(x):
        # replace the final character with the next code point, giving
        # the smallest key that no longer starts with x
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches url_canon
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == 'exact':
        # '!' is the character right after space
        # NOTE(review): presumably index lines separate the key from the
        # remaining fields with a space -- confirm against the cdx format
        end_key = start_key + '!'

    elif match_type == 'prefix':
        # add trailing slash if url has it
        if url.endswith('/') and not start_key.endswith('/'):
            start_key += '/'

        end_key = inc_last_char(start_key)

    elif match_type == 'host':
        if surt_ordered:
            host = start_key.split(')/')[0]

            start_key = host + ')/'
            # '*' is the character right after ')'
            end_key = host + '*'
        else:
            host = urlparse.urlsplit(url).netloc

            start_key = host + '/'
            # '0' is the character right after '/'
            end_key = host + '0'

    elif match_type == 'domain':
        if not surt_ordered:
            raise Exception('matchType=domain unsupported for non-surt')

        host = start_key.split(')/')[0]

        # if tld, use com, as start_key
        # otherwise, stick with com,example)/
        if ',' not in host:
            start_key = host + ','
        else:
            start_key = host + ')/'

        # '-' is the character right after ','
        end_key = host + '-'

    else:
        # %-formatting keeps the intended error for a non-string
        # match_type (e.g. None) instead of failing with a TypeError
        raise Exception('Invalid match_type: %s' % (match_type,))

    return (start_key, end_key)
# When executed as a script, run the doctests embedded in this module.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

View File

@ -1,4 +1,4 @@
from canonicalize import UrlCanonicalizer
from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
@ -14,8 +14,23 @@ import urlparse
#=================================================================
class BaseCDXServer(object):
def __init__(self, **kwargs):
    """
    Set up the canonicalizer, fuzzy query and permissions checker.

    Recognized keyword arguments:
    * ds_rules: if set, the canonicalizer and fuzzy query are both
      loaded via load_domain_specific_cdx_rules(ds_rules, surt_ordered)
    * surt_ordered: passed to the rules loader and to the default
      canonicalizer (default True)
    * url_canon, fuzzy_query: used only when ds_rules is not set
    * perms_checker: optional permissions checker, stored as-is
    """
    ds_rules = kwargs.get('ds_rules')
    surt_ordered = kwargs.get('surt_ordered', True)

    # load from domain-specific rules
    if ds_rules:
        self.url_canon, self.fuzzy_query = (
            load_domain_specific_cdx_rules(ds_rules, surt_ordered))
    # or custom passed in canonicalizer
    else:
        self.url_canon = kwargs.get('url_canon')
        self.fuzzy_query = kwargs.get('fuzzy_query')

    # set default canonicalizer if none set thus far; built only here so
    # that it honours surt_ordered and is not created needlessly
    if not self.url_canon:
        self.url_canon = UrlCanonicalizer(surt_ordered)

    # set perms checker, if any
    self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, params):
@ -77,7 +92,14 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
params['key'] = self.url_canon(url)
#params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url,
match_type=match_type,
url_canon=self.url_canon)
params['key'] = key
params['end_key'] = end_key
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
@ -131,15 +153,6 @@ def create_cdx_server(config, ds_rules_file=None):
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if ds_rules_file:
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
surt_ordered)
else:
canon, fuzzy = None, None
if not canon:
canon = UrlCanonicalizer(surt_ordered)
if (isinstance(paths, str) and
any(paths.startswith(x) for x in ['http://', 'https://'])):
server_cls = RemoteCDXServer
@ -147,8 +160,8 @@ def create_cdx_server(config, ds_rules_file=None):
server_cls = CDXServer
return server_cls(paths,
url_canon=canon,
fuzzy_query=fuzzy,
surt_ordered=surt_ordered,
ds_rules=ds_rules_file,
perms_checker=perms_checker)

View File

@ -1,4 +1,4 @@
from pywb.utils.binsearch import iter_exact, iter_prefix
from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader
import urllib
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
def load_cdx(self, params):
    """
    Open this source's cdx file and return an iterator over the lines
    in the range given by params['key'] and params['end_key'].

    Every match type goes through iter_range(); the former
    iter_exact / iter_prefix dispatch returned first, which left the
    range lookup unreachable and end_key unused.
    """
    source = SeekableTextFileReader(self.filename)
    # NOTE(review): both bounds are read with params.get(), so a caller
    # that omits 'end_key' passes None as the upper bound -- confirm
    # that every caller sets it
    return iter_range(source, params.get('key'), params.get('end_key'))
def __str__(self):
    """Return a short label for this source that includes its filename."""
    label = 'CDX File - '
    return label + self.filename

View File

@ -87,6 +87,26 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
return gen_iter(line)
#=================================================================
def iter_range(reader, start, end):
    """
    Creates an iterator which iterates over lines where
    start <= line < end (end exclusive)

    :param reader: the reader handed to search()
    :param start: inclusive lower bound of the range
    :param end: exclusive upper bound of the range
    :return: an iterator that stops at the first line >= end
    """
    # NOTE(review): the lower bound relies on search() yielding lines
    # beginning with the first one that is >= start -- confirm
    iter_ = search(reader, start)

    # strict comparison: a line equal to end is outside the range
    end_iter = itertools.takewhile(
        lambda line: line < end,
        iter_)

    return end_iter
#=================================================================
def iter_prefix(reader, key):
"""

View File

@ -25,12 +25,21 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# Range Search (end exclusive)
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
"""
#=================================================================
import os
from pywb.utils.binsearch import iter_prefix, iter_exact
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir
@ -45,6 +54,13 @@ def print_binsearch_results(key, iter_func):
print line
def print_binsearch_results_range(key, end_key, iter_func):
    """
    Print every line that iter_func yields for the range from key to
    end_key over the iana.cdx test file.
    """
    reader = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
    matches = iter_func(reader, key, end_key)
    for entry in matches:
        print(entry)
# When executed as a script, run the doctests in this module's docstring.
if __name__ == "__main__":
    import doctest
    doctest.testmod()