mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
webagg: add preliminary 'fuzzy matching' fallback support, currently enabled for all sources
(TODO: enable it only for sources that support it)
This commit is contained in:
parent
00bdddd1e9
commit
a93f75dca2
@ -3,6 +3,9 @@ from webagg.utils import MementoUtils
|
|||||||
from pywb.utils.wbexception import BadRequestException, WbException
|
from pywb.utils.wbexception import BadRequestException, WbException
|
||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
|
|
||||||
|
from pywb.cdx.query import CDXQuery
|
||||||
|
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
def to_cdxj(cdx_iter, fields):
|
def to_cdxj(cdx_iter, fields):
|
||||||
@ -22,6 +25,39 @@ def to_link(cdx_iter, fields):
|
|||||||
return content_type, MementoUtils.make_timemap(cdx_iter)
|
return content_type, MementoUtils.make_timemap(cdx_iter)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
class FuzzyMatcher(object):
    """Wrap an index source so that an empty lookup is retried "fuzzily".

    The exact query is run first. Only when it produces no cdx entries at
    all is a second, rewritten query (built by the domain-specific rules)
    sent to the same index source.
    """

    # Rules file loaded when the caller does not supply one.
    DEFAULT_RULES_FILE = 'pywb/rules.yaml'

    def __init__(self, rules_file=None):
        """Load the domain-specific rules.

        :param rules_file: path of the rules file to load; defaults to
            ``DEFAULT_RULES_FILE`` so ``FuzzyMatcher()`` keeps working.
        """
        # NOTE(review): the meaning of the second positional argument (True)
        # is defined by load_domain_specific_cdx_rules -- confirm there.
        res = load_domain_specific_cdx_rules(
            rules_file or self.DEFAULT_RULES_FILE, True)

        # url_canon is stored but not used by this class itself.
        self.url_canon, self.fuzzy_query = res

    def __call__(self, index_source, params):
        """Query ``index_source`` and wrap its results with the fallback.

        :param index_source: callable taking ``params`` and returning a
            ``(cdx_iter, errs)`` pair.
        :param params: query parameters passed to ``index_source``.
        :returns: ``(generator, errs)`` where ``errs`` is the value returned
            by the initial (exact) lookup.
        """
        cdx_iter, errs = index_source(params)
        return self.do_fuzzy(cdx_iter, index_source, params), errs

    def do_fuzzy(self, cdx_iter, index_source, params):
        """Yield from ``cdx_iter``; if it was empty, yield fuzzy results.

        This is a generator, so the fallback query is only issued once the
        exact results have been fully consumed and found to be empty.
        """
        found = False
        for cdx in cdx_iter:
            found = True
            yield cdx

        # Exact lookup produced results: no fallback needed.
        if found:
            return

        fuzzy_query_params = self.fuzzy_query(CDXQuery(params))

        # Falsy result from the rules: there is no fuzzy query to try.
        if not fuzzy_query_params:
            return

        # The retry is issued without 'alt_url'.
        fuzzy_query_params.pop('alt_url', '')

        # NOTE: errors reported by the fuzzy lookup are dropped here; the
        # caller has already received the errors of the exact lookup.
        new_iter, _ = index_source(fuzzy_query_params)

        for cdx in new_iter:
            yield cdx
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class IndexHandler(object):
|
class IndexHandler(object):
|
||||||
OUTPUTS = {
|
OUTPUTS = {
|
||||||
@ -33,9 +69,10 @@ class IndexHandler(object):
|
|||||||
|
|
||||||
DEF_OUTPUT = 'cdxj'
|
DEF_OUTPUT = 'cdxj'
|
||||||
|
|
||||||
def __init__(self, index_source, opts=None):
|
def __init__(self, index_source, opts=None, *args, **kwargs):
|
||||||
self.index_source = index_source
|
self.index_source = index_source
|
||||||
self.opts = opts or {}
|
self.opts = opts or {}
|
||||||
|
self.fuzzy = FuzzyMatcher()
|
||||||
|
|
||||||
def get_supported_modes(self):
|
def get_supported_modes(self):
|
||||||
return dict(modes=['list_sources', 'index'])
|
return dict(modes=['list_sources', 'index'])
|
||||||
@ -50,7 +87,7 @@ class IndexHandler(object):
|
|||||||
if input_req:
|
if input_req:
|
||||||
params['alt_url'] = input_req.include_post_query(url)
|
params['alt_url'] = input_req.include_post_query(url)
|
||||||
|
|
||||||
return self.index_source(params)
|
return self.fuzzy(self.index_source, params)
|
||||||
|
|
||||||
def __call__(self, params):
|
def __call__(self, params):
|
||||||
mode = params.get('mode', 'index')
|
mode = params.get('mode', 'index')
|
||||||
|
@ -216,7 +216,8 @@ class LiveWebLoader(BaseLoader):
|
|||||||
'x-archive')
|
'x-archive')
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.sesh = requests.session()
|
#self.sesh = requests.session()
|
||||||
|
self.sesh = requests
|
||||||
|
|
||||||
def load_resource(self, cdx, params):
|
def load_resource(self, cdx, params):
|
||||||
load_url = cdx.get('load_url')
|
load_url = cdx.get('load_url')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user