mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
binsearch: add range based matching via iter_range()
support for: exact, prefix, host, domain match types
This commit is contained in:
parent
922917a631
commit
a56cbcf62e
@ -2,6 +2,7 @@
|
||||
"""
|
||||
|
||||
import surt
|
||||
import urlparse
|
||||
from cdxobject import CDXException
|
||||
|
||||
|
||||
@ -69,6 +70,109 @@ index.html?a=b?c=)/')
|
||||
return surt
|
||||
|
||||
|
||||
#=================================================================
|
||||
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Canonicalize a url (either with a custom canonicalizer or with the
    standard canonicalizer, with or without surt ordering).

    Then compute the start and end keys of the search range
    for the given match type.

    Supported match types:
    * exact
    * prefix
    * host
    * domain (only available for surt ordering)

    Arguments:
    url          -- the url to canonicalize and build the range for
    match_type   -- one of 'exact', 'prefix', 'host' or 'domain'
    surt_ordered -- used only when a new canonicalizer has to be created;
                    ignored in favor of url_canon.surt_ordered when
                    url_canon is supplied
    url_canon    -- optional callable returning the canonical key for a
                    url, exposing a surt_ordered attribute

    Returns a (start_key, end_key) tuple.

    Raises Exception for an unknown match_type, or for 'domain'
    when the keys are not surt ordered.

    Examples below:

    # surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact')
    ('com,example)/path/file.html', 'com,example)/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
    ('com,example)/path/file.html', 'com,example)/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host')
    ('com,example)/', 'com,example*')

    >>> calc_search_range('http://example.com/path/file.html', 'domain')
    ('com,example)/', 'com,example-')

    # special case for tld domain range
    >>> calc_search_range('com', 'domain')
    ('com,', 'com-')

    # non-surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
    ('example.com/path/file.html', 'example.com/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
    ('example.com/path/file.html', 'example.com/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

    # domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
    Exception: matchType=domain unsupported for non-surt
    """
    def inc_last_char(x):
        # bump the final character by one code point: the result sorts
        # after every string that starts with x
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches url_canon
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == 'exact':
        # '!' is the character right after ' ', so the range ends just
        # past the lines of the form '<key> ...'
        end_key = start_key + '!'

    elif match_type == 'prefix':
        # add trailing slash if url has it
        if url.endswith('/') and not start_key.endswith('/'):
            start_key += '/'

        end_key = inc_last_char(start_key)

    elif match_type == 'host':
        if surt_ordered:
            host = start_key.split(')/')[0]

            start_key = host + ')/'
            # '*' is the character right after ')'
            end_key = host + '*'
        else:
            # Take the host from the canonicalized key, not from the raw
            # url: the raw url may lack a scheme (giving an empty netloc)
            # or carry a host that canonicalization rewrites, and the
            # range has to be expressed in the same form as the key.
            host = start_key.split('/')[0]

            start_key = host + '/'
            # '0' is the character right after '/'
            end_key = host + '0'

    elif match_type == 'domain':
        if not surt_ordered:
            raise Exception('matchType=domain unsupported for non-surt')

        host = start_key.split(')/')[0]

        # if tld, use com, as start_key
        # otherwise, stick with com,example)/
        if ',' not in host:
            start_key = host + ','
        else:
            start_key = host + ')/'

        # '-' is the character right after ','
        end_key = host + '-'
    else:
        raise Exception('Invalid match_type: ' + match_type)

    return (start_key, end_key)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
@ -1,4 +1,4 @@
|
||||
from canonicalize import UrlCanonicalizer
|
||||
from canonicalize import UrlCanonicalizer, calc_search_range
|
||||
|
||||
from cdxops import cdx_load
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||
@ -14,8 +14,23 @@ import urlparse
|
||||
#=================================================================
|
||||
class BaseCDXServer(object):
|
||||
def __init__(self, **kwargs):
    """
    Set up the canonicalizer, fuzzy query and permission checker.

    Keyword arguments read:
    ds_rules      -- if set, the canonicalizer and fuzzy query are both
                     loaded via load_domain_specific_cdx_rules()
    surt_ordered  -- passed to the rules loader and to the default
                     canonicalizer (defaults to True)
    url_canon     -- custom canonicalizer, used only without ds_rules
    fuzzy_query   -- custom fuzzy query, used only without ds_rules
    perms_checker -- optional permission checker, stored as-is
    """
    ds_rules = kwargs.get('ds_rules')
    surt_ordered = kwargs.get('surt_ordered', True)

    # load from domain-specific rules
    if ds_rules:
        self.url_canon, self.fuzzy_query = (
            load_domain_specific_cdx_rules(ds_rules, surt_ordered))
    # or custom passed in canonicalizer
    else:
        self.url_canon = kwargs.get('url_canon')
        self.fuzzy_query = kwargs.get('fuzzy_query')

    # set default canonicalizer if none set thus far
    # (built lazily here rather than as a kwargs.get() default, so no
    # throw-away canonicalizer is constructed when one is supplied)
    if not self.url_canon:
        self.url_canon = UrlCanonicalizer(surt_ordered)

    # set perms checker, if any
    self.perms_checker = kwargs.get('perms_checker')
|
||||
|
||||
def _check_cdx_iter(self, cdx_iter, params):
|
||||
@ -77,7 +92,14 @@ class CDXServer(BaseCDXServer):
|
||||
msg = 'A url= param must be specified to query the cdx server'
|
||||
raise CDXException(msg)
|
||||
|
||||
params['key'] = self.url_canon(url)
|
||||
#params['key'] = self.url_canon(url)
|
||||
match_type = params.get('matchType', 'exact')
|
||||
|
||||
key, end_key = calc_search_range(url=url,
|
||||
match_type=match_type,
|
||||
url_canon=self.url_canon)
|
||||
params['key'] = key
|
||||
params['end_key'] = end_key
|
||||
|
||||
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
|
||||
|
||||
@ -131,15 +153,6 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
|
||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||
|
||||
if ds_rules_file:
|
||||
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
|
||||
surt_ordered)
|
||||
else:
|
||||
canon, fuzzy = None, None
|
||||
|
||||
if not canon:
|
||||
canon = UrlCanonicalizer(surt_ordered)
|
||||
|
||||
if (isinstance(paths, str) and
|
||||
any(paths.startswith(x) for x in ['http://', 'https://'])):
|
||||
server_cls = RemoteCDXServer
|
||||
@ -147,8 +160,8 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
server_cls = CDXServer
|
||||
|
||||
return server_cls(paths,
|
||||
url_canon=canon,
|
||||
fuzzy_query=fuzzy,
|
||||
surt_ordered=surt_ordered,
|
||||
ds_rules=ds_rules_file,
|
||||
perms_checker=perms_checker)
|
||||
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from pywb.utils.binsearch import iter_exact, iter_prefix
|
||||
from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.loaders import SeekableTextFileReader
|
||||
|
||||
import urllib
|
||||
@ -24,17 +24,7 @@ class CDXFile(CDXSource):
|
||||
|
||||
def load_cdx(self, params):
    """
    Open this source's cdx file and return an iterator over the lines
    in the range given by params['key'] and params['end_key'].

    The match type is not inspected here: the range bounds alone are
    handed to iter_range(). Missing params are passed through as None.
    """
    source = SeekableTextFileReader(self.filename)

    return iter_range(source, params.get('key'), params.get('end_key'))
|
||||
|
||||
def __str__(self):
|
||||
return 'CDX File - ' + self.filename
|
||||
|
@ -87,6 +87,26 @@ def search(reader, key, prev_size=0, compare_func=cmp, block_size=8192):
|
||||
return gen_iter(line)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def iter_range(reader, start, end):
    """
    Creates an iterator which iterates over lines where
    start <= line < end (end exclusive)

    reader -- source handed to search() to locate the start position
    start  -- lower bound of the range
    end    -- upper bound of the range; a line equal to end is excluded
    """
    # NOTE(review): an earlier draft also wrapped this in
    # itertools.dropwhile(lambda line: line < start, ...); presumably
    # search() already begins at the first line >= start -- confirm.
    iter_ = search(reader, start)

    # strict comparison keeps the range end-exclusive, as documented
    end_iter = itertools.takewhile(
        lambda line: line < end,
        iter_)

    return end_iter
|
||||
|
||||
|
||||
#=================================================================
|
||||
def iter_prefix(reader, key):
|
||||
"""
|
||||
|
@ -25,12 +25,21 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
|
||||
>>> print_binsearch_results('org,iana)/time-zones', iter_exact)
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
|
||||
# Range Search (end exclusive)
|
||||
>>> print_binsearch_results_range('org,iana)/about', 'org,iana)/domains', iter_range)
|
||||
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
|
||||
org,iana)/about/performance/ietf-draft-status 20140126200815 http://www.iana.org/about/performance/ietf-draft-status text/html 302 Y7CTA2QZUSCDTJCSECZNSPIBLJDO7PJJ - - 584 596566 iana.warc.gz
|
||||
org,iana)/about/performance/ietf-statistics 20140126200804 http://www.iana.org/about/performance/ietf-statistics text/html 302 HNYDN7XRX46RQTT2OFIWXKEYMZQAJWHD - - 582 581890 iana.warc.gz
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz
|
||||
"""
|
||||
|
||||
|
||||
#=================================================================
|
||||
import os
|
||||
from pywb.utils.binsearch import iter_prefix, iter_exact
|
||||
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
|
||||
from pywb.utils.loaders import SeekableTextFileReader
|
||||
|
||||
from pywb import get_test_dir
|
||||
@ -45,6 +54,13 @@ def print_binsearch_results(key, iter_func):
|
||||
print line
|
||||
|
||||
|
||||
def print_binsearch_results_range(key, end_key, iter_func):
    """
    Print every line that iter_func yields from the iana.cdx test
    file for the given key and end_key.
    """
    reader = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
    matches = iter_func(reader, key, end_key)

    for cdx_line in matches:
        print(cdx_line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
Loading…
x
Reference in New Issue
Block a user