mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
binsearch: add range based matching via iter_range()
support for: exact, prefix, host, domain match types
This commit is contained in:
parent
922917a631
commit
a56cbcf62e
@ -2,6 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import surt
|
import surt
|
||||||
|
import urlparse
|
||||||
from cdxobject import CDXException
|
from cdxobject import CDXException
|
||||||
|
|
||||||
|
|
||||||
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
|
|||||||
return surt
|
return surt
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Canonicalize a url (either with a custom canonicalizer or the
    standard canonicalizer, with or without surt ordering).

    Then, compute the start and end keys of the search range
    for a given match type.

    Supported match types:
    * exact
    * prefix
    * host
    * domain (only available with surt ordering)

    :param url: url to compute the search range for
    :param match_type: one of 'exact', 'prefix', 'host', 'domain'
    :param surt_ordered: used only when no url_canon is passed in,
                         to create the default canonicalizer
    :param url_canon: optional canonicalizer: a callable mapping a url
                      to its key, exposing a ``surt_ordered`` attribute.
                      When given, its ``surt_ordered`` takes precedence
                      over the ``surt_ordered`` argument.
    :return: tuple of (start_key, end_key)
    :raises Exception: for an unknown match_type, or for 'domain'
                       when the keys are not surt ordered

    Examples below:

    # surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact')
    ('com,example)/path/file.html', 'com,example)/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
    ('com,example)/path/file.html', 'com,example)/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host')
    ('com,example)/', 'com,example*')

    >>> calc_search_range('http://example.com/path/file.html', 'domain')
    ('com,example)/', 'com,example-')

    # special case for tld domain range
    >>> calc_search_range('com', 'domain')
    ('com,', 'com-')

    # non-surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
    ('example.com/path/file.html', 'example.com/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
    ('example.com/path/file.html', 'example.com/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

    # domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
    Exception: matchType=domain unsupported for non-surt
    """
    def inc_last_char(x):
        # smallest string sorting after every string prefixed by x
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches url_canon
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == 'exact':
        end_key = start_key + '!'

    elif match_type == 'prefix':
        # add trailing slash if url has it
        if url.endswith('/') and not start_key.endswith('/'):
            start_key += '/'

        end_key = inc_last_char(start_key)

    elif match_type == 'host':
        if surt_ordered:
            host = start_key.split(')/')[0]

            start_key = host + ')/'
            end_key = host + '*'
        else:
            # Take the host from the canonicalized key rather than the
            # raw url, so the range lines up with the canonical keys
            # even when the url has no scheme or a non-canonical host.
            # NOTE(review): assumes non-surt keys look like 'host/path'
            # (as in the doctests above) -- confirm for custom canons.
            host = start_key.split('/')[0]

            start_key = host + '/'
            end_key = host + '0'

    elif match_type == 'domain':
        if not surt_ordered:
            raise Exception('matchType=domain unsupported for non-surt')

        host = start_key.split(')/')[0]

        # if tld, use com, as start_key
        # otherwise, stick with com,example)/
        if ',' not in host:
            start_key = host + ','
        else:
            start_key = host + ')/'

        end_key = host + '-'
    else:
        raise Exception('Invalid match_type: ' + match_type)

    return (start_key, end_key)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from canonicalize import UrlCanonicalizer
|
from canonicalize import UrlCanonicalizer, calc_search_range
|
||||||
|
|
||||||
from cdxops import cdx_load
|
from cdxops import cdx_load
|
||||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||||
@ -14,8 +14,23 @@ import urlparse
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
class BaseCDXServer(object):
|
class BaseCDXServer(object):
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
|
ds_rules = kwargs.get('ds_rules')
|
||||||
self.fuzzy_query = kwargs.get('fuzzy_query')
|
surt_ordered = kwargs.get('surt_ordered', True)
|
||||||
|
|
||||||
|
# load from domain-specific rules
|
||||||
|
if ds_rules:
|
||||||
|
self.url_canon, self.fuzzy_query = (
|
||||||
|
load_domain_specific_cdx_rules(ds_rules, surt_ordered))
|
||||||
|
# or custom passed in canonicalizer
|
||||||
|
else:
|
||||||
|
self.url_canon = kwargs.get('url_canon')
|
||||||
|
self.fuzzy_query = kwargs.get('fuzzy_query')
|
||||||
|
|
||||||
|
# set default canonicalizer if none set thus far
|
||||||
|
if not self.url_canon:
|
||||||
|
self.url_canon = UrlCanonicalizer(surt_ordered)
|
||||||
|
|
||||||
|
# set perms checker, if any
|
||||||
self.perms_checker = kwargs.get('perms_checker')
|
self.perms_checker = kwargs.get('perms_checker')
|
||||||
|
|
||||||
def _check_cdx_iter(self, cdx_iter, params):
|
def _check_cdx_iter(self, cdx_iter, params):
|
||||||
@ -77,7 +92,14 @@ class CDXServer(BaseCDXServer):
|
|||||||
msg = 'A url= param must be specified to query the cdx server'
|
msg = 'A url= param must be specified to query the cdx server'
|
||||||
raise CDXException(msg)
|
raise CDXException(msg)
|
||||||
|
|
||||||
params['key'] = self.url_canon(url)
|
#params['key'] = self.url_canon(url)
|
||||||
|
match_type = params.get('matchType', 'exact')
|
||||||
|
|
||||||
|
key, end_key = calc_search_range(url=url,
|
||||||
|
match_type=match_type,
|
||||||
|
url_canon=self.url_canon)
|
||||||
|
params['key'] = key
|
||||||
|
params['end_key'] = end_key
|
||||||
|
|
||||||
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
|
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
|
||||||
|
|
||||||
@ -131,15 +153,6 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
|
|
||||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||||
|
|
||||||
if ds_rules_file:
|
|
||||||
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
|
|
||||||
surt_ordered)
|
|
||||||
else:
|
|
||||||
canon, fuzzy = None, None
|
|
||||||
|
|
||||||
if not canon:
|
|
||||||
canon = UrlCanonicalizer(surt_ordered)
|
|
||||||
|
|
||||||
if (isinstance(paths, str) and
|
if (isinstance(paths, str) and
|
||||||
any(paths.startswith(x) for x in ['http://', 'https://'])):
|
any(paths.startswith(x) for x in ['http://', 'https://'])):
|
||||||
server_cls = RemoteCDXServer
|
server_cls = RemoteCDXServer
|
||||||
@ -147,8 +160,8 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
server_cls = CDXServer
|
server_cls = CDXServer
|
||||||
|
|
||||||
return server_cls(paths,
|
return server_cls(paths,
|
||||||
url_canon=canon,
|
surt_ordered=surt_ordered,
|
||||||
fuzzy_query=fuzzy,
|
ds_rules=ds_rules_file,
|
||||||
perms_checker=perms_checker)
|
perms_checker=perms_checker)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from pywb.utils.binsearch import iter_exact, iter_prefix
|
from pywb.utils.binsearch import iter_range
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
from pywb.utils.loaders import SeekableTextFileReader
|
||||||
|
|
||||||
import urllib
|
import urllib
|
||||||
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
|
|||||||
|
|
||||||
def load_cdx(self, params):
|
def load_cdx(self, params):
|
||||||
source = SeekableTextFileReader(self.filename)
|
source = SeekableTextFileReader(self.filename)
|
||||||
|
return iter_range(source, params.get('key'), params.get('end_key'))
|
||||||
match_type = params.get('matchType')
|
|
||||||
|
|
||||||
if match_type == 'prefix':
|
|
||||||
iter_func = iter_prefix
|
|
||||||
else:
|
|
||||||
iter_func = iter_exact
|
|
||||||
|
|
||||||
key = params.get('key')
|
|
||||||
|
|
||||||
return iter_func(source, key)
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'CDX File - ' + self.filename
|
return 'CDX File - ' + self.filename
|
||||||
|
@ -87,6 +87,26 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
|
|||||||
return gen_iter(line)
|
return gen_iter(line)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def iter_range(reader, start, end):
    """
    Creates an iterator which iterates over lines where
    start <= line < end (end exclusive)

    :param reader: reader passed through to search()
    :param start: first key of the range (lower bound)
    :param end: key at which iteration stops; a line equal to
                end is not yielded
    :return: a lazy iterator over the matching lines
    """
    # NOTE(review): the lower bound relies on search() starting at the
    # first line >= start -- confirm against search()'s contract.
    lines = search(reader, start)

    # Strict comparison keeps the end key itself out of the results,
    # matching the documented end-exclusive contract.
    return itertools.takewhile(lambda line: line < end, lines)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def iter_prefix(reader, key):
|
def iter_prefix(reader, key):
|
||||||
"""
|
"""
|
||||||
|
@ -25,12 +25,21 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
|||||||
|
|
||||||
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
|
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
|
||||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||||
|
|
||||||
|
|
||||||
|
# Range Search (end exclusive)
|
||||||
|
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
|
||||||
|
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
|
||||||
|
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
|
||||||
|
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
|
||||||
|
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
|
||||||
|
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
import os
|
import os
|
||||||
from pywb.utils.binsearch import iter_prefix, iter_exact
|
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
from pywb.utils.loaders import SeekableTextFileReader
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
@ -45,6 +54,13 @@ def print_binsearch_results(key, iter_func):
|
|||||||
print line
|
print line
|
||||||
|
|
||||||
|
|
||||||
|
def print_binsearch_results_range(key, end_key, iter_func):
    """
    Doctest helper: run iter_func over the iana.cdx test file for the
    range key .. end_key and print each line it yields.
    """
    reader = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')

    for entry in iter_func(reader, key, end_key):
        print(entry)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user